author    | Dimitry Andric <dim@FreeBSD.org> | 2019-08-20 20:50:12 +0000
committer | Dimitry Andric <dim@FreeBSD.org> | 2019-08-20 20:50:12 +0000
commit    | e6d1592492a3a379186bfb02bd0f4eda0669c0d5 (patch)
tree      | 599ab169a01f1c86eda9adc774edaedde2f2db5b /lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
parent    | 1a56a5ead7a2e84bee8240f5f6b033b5f1707154 (diff)
Diffstat (limited to 'lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp')
-rw-r--r-- | lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 136
1 file changed, 131 insertions, 5 deletions
diff --git a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 4dc1e67c573d..b750c6b5f6d2 100644
--- a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -1,9 +1,8 @@
 //===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -62,6 +61,7 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
   AssumptionCache *AC = nullptr;
   LegacyDivergenceAnalysis *DA = nullptr;
   Module *Mod = nullptr;
+  const DataLayout *DL = nullptr;
   bool HasUnsafeFPMath = false;
 
   /// Copies exact/nsw/nuw flags (if any) from binary operation \p I to
@@ -134,6 +134,16 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
   /// \returns True.
   bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;
 
+  unsigned numBitsUnsigned(Value *Op, unsigned ScalarSize) const;
+  unsigned numBitsSigned(Value *Op, unsigned ScalarSize) const;
+  bool isI24(Value *V, unsigned ScalarSize) const;
+  bool isU24(Value *V, unsigned ScalarSize) const;
+
+  /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.i24.
+  /// SelectionDAG has an issue where an and asserting the bits are known
+  bool replaceMulWithMul24(BinaryOperator &I) const;
+
   /// Expands 24 bit div or rem.
   Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
                         Value *Num, Value *Den,
@@ -393,6 +403,118 @@ bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
   return true;
 }
 
+unsigned AMDGPUCodeGenPrepare::numBitsUnsigned(Value *Op,
+                                               unsigned ScalarSize) const {
+  KnownBits Known = computeKnownBits(Op, *DL, 0, AC);
+  return ScalarSize - Known.countMinLeadingZeros();
+}
+
+unsigned AMDGPUCodeGenPrepare::numBitsSigned(Value *Op,
+                                             unsigned ScalarSize) const {
+  // In order for this to be a signed 24-bit value, bit 23 must be a sign bit.
+  return ScalarSize - ComputeNumSignBits(Op, *DL, 0, AC);
+}
+
+bool AMDGPUCodeGenPrepare::isI24(Value *V, unsigned ScalarSize) const {
+  return ScalarSize >= 24 && // Types less than 24-bit should be treated
+                             // as unsigned 24-bit values.
+         numBitsSigned(V, ScalarSize) < 24;
+}
+
+bool AMDGPUCodeGenPrepare::isU24(Value *V, unsigned ScalarSize) const {
+  return numBitsUnsigned(V, ScalarSize) <= 24;
+}
+
+static void extractValues(IRBuilder<> &Builder,
+                          SmallVectorImpl<Value *> &Values, Value *V) {
+  VectorType *VT = dyn_cast<VectorType>(V->getType());
+  if (!VT) {
+    Values.push_back(V);
+    return;
+  }
+
+  for (int I = 0, E = VT->getNumElements(); I != E; ++I)
+    Values.push_back(Builder.CreateExtractElement(V, I));
+}
+
+static Value *insertValues(IRBuilder<> &Builder,
+                           Type *Ty,
+                           SmallVectorImpl<Value *> &Values) {
+  if (Values.size() == 1)
+    return Values[0];
+
+  Value *NewVal = UndefValue::get(Ty);
+  for (int I = 0, E = Values.size(); I != E; ++I)
+    NewVal = Builder.CreateInsertElement(NewVal, Values[I], I);
+
+  return NewVal;
+}
+
+bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const {
+  if (I.getOpcode() != Instruction::Mul)
+    return false;
+
+  Type *Ty = I.getType();
+  unsigned Size = Ty->getScalarSizeInBits();
+  if (Size <= 16 && ST->has16BitInsts())
+    return false;
+
+  // Prefer scalar if this could be s_mul_i32
+  if (DA->isUniform(&I))
+    return false;
+
+  Value *LHS = I.getOperand(0);
+  Value *RHS = I.getOperand(1);
+  IRBuilder<> Builder(&I);
+  Builder.SetCurrentDebugLocation(I.getDebugLoc());
+
+  Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
+
+  // TODO: Should this try to match mulhi24?
+  if (ST->hasMulU24() && isU24(LHS, Size) && isU24(RHS, Size)) {
+    IntrID = Intrinsic::amdgcn_mul_u24;
+  } else if (ST->hasMulI24() && isI24(LHS, Size) && isI24(RHS, Size)) {
+    IntrID = Intrinsic::amdgcn_mul_i24;
+  } else
+    return false;
+
+  SmallVector<Value *, 4> LHSVals;
+  SmallVector<Value *, 4> RHSVals;
+  SmallVector<Value *, 4> ResultVals;
+  extractValues(Builder, LHSVals, LHS);
+  extractValues(Builder, RHSVals, RHS);
+
+  IntegerType *I32Ty = Builder.getInt32Ty();
+  FunctionCallee Intrin = Intrinsic::getDeclaration(Mod, IntrID);
+  for (int I = 0, E = LHSVals.size(); I != E; ++I) {
+    Value *LHS, *RHS;
+    if (IntrID == Intrinsic::amdgcn_mul_u24) {
+      LHS = Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
+      RHS = Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
+    } else {
+      LHS = Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty);
+      RHS = Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty);
+    }
+
+    Value *Result = Builder.CreateCall(Intrin, {LHS, RHS});
+
+    if (IntrID == Intrinsic::amdgcn_mul_u24) {
+      ResultVals.push_back(Builder.CreateZExtOrTrunc(Result,
+                                                     LHSVals[I]->getType()));
+    } else {
+      ResultVals.push_back(Builder.CreateSExtOrTrunc(Result,
+                                                     LHSVals[I]->getType()));
+    }
+  }
+
+  I.replaceAllUsesWith(insertValues(Builder, Ty, ResultVals));
+  I.eraseFromParent();
+
+  return true;
+}
+
 static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) {
   const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
   if (!CNum)
@@ -757,6 +879,9 @@ bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
       DA->isUniform(&I) && promoteUniformOpToI32(I))
     return true;
 
+  if (replaceMulWithMul24(I))
+    return true;
+
   bool Changed = false;
   Instruction::BinaryOps Opc = I.getOpcode();
   Type *Ty = I.getType();
@@ -807,7 +932,7 @@ bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
     Type *I32Ty = Builder.getInt32Ty();
     Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace());
     Value *BitCast = Builder.CreateBitCast(I.getPointerOperand(), PT);
-    LoadInst *WidenLoad = Builder.CreateLoad(BitCast);
+    LoadInst *WidenLoad = Builder.CreateLoad(I32Ty, BitCast);
     WidenLoad->copyMetadata(I);
 
     // If we have range metadata, we need to convert the type, and not make
@@ -883,6 +1008,7 @@ bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
 
 bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
   Mod = &M;
+  DL = &Mod->getDataLayout();
   return false;
 }
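
A note on the thresholds in isU24/isI24 above: they follow from simple width arithmetic. numBitsUnsigned is the scalar width minus the known leading zero bits, numBitsSigned is the width minus the redundant sign-bit copies, and a signed 24-bit value must keep bit 23 free as a sign bit, hence the strict `< 24`. Below is a standalone C++ sketch of that arithmetic, specialized to fully-known 32-bit constants; the pass itself works on partially-known values via computeKnownBits/ComputeNumSignBits, so this harness is illustrative only (C++20 for std::countl_zero).

#include <bit>
#include <cstdint>
#include <cstdio>

// Width minus known leading zeros, as in the patch's numBitsUnsigned.
static unsigned numBitsUnsigned(uint32_t V) {
  return 32 - std::countl_zero(V);
}

// Width minus redundant sign-bit copies, mirroring ComputeNumSignBits:
// for negatives count leading ones, for non-negatives leading zeros.
static unsigned numBitsSigned(int32_t V) {
  uint32_t U = static_cast<uint32_t>(V < 0 ? ~V : V);
  return 32 - std::countl_zero(U);
}

static bool isU24(uint32_t V) { return numBitsUnsigned(V) <= 24; }
// Bit 23 must remain a sign bit, hence strictly fewer than 24 value bits.
static bool isI24(int32_t V)  { return numBitsSigned(V) < 24; }

int main() {
  std::printf("%d\n", isU24(0xFFFFFFu));  // 1: exactly 24 bits
  std::printf("%d\n", isU24(0x1000000u)); // 0: needs 25 bits
  std::printf("%d\n", isI24(-(1 << 22))); // 1: -2^22 fits in i24
  std::printf("%d\n", isI24(1 << 23));    // 0: 2^23 would clobber the sign bit
}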
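And why the rewrite preserves semantics: the mul24 intrinsics lower to v_mul_u32_u24 / v_mul_i32_i24, which, as I understand them, read only the low 24 bits of each source (zero- or sign-extended respectively) but produce a full 32-bit product. So whenever both operands pass isU24/isI24, the 24-bit multiply agrees bit-for-bit with a plain 32-bit multiply. A small C++ model of that argument follows; it is my paraphrase of the instruction semantics, not an authoritative definition.

#include <cassert>
#include <cstdint>

// Model of llvm.amdgcn.mul.u24: multiply the low 24 bits of each source,
// keep the low 32 bits of the product.
static uint32_t mulU24(uint32_t A, uint32_t B) {
  return (A & 0xFFFFFF) * (B & 0xFFFFFF);
}

// Model of llvm.amdgcn.mul.i24: same, but the low 24 bits are sign-extended.
static int32_t signExtend24(uint32_t V) {
  V &= 0xFFFFFF;
  return (V & 0x800000) ? static_cast<int32_t>(V | 0xFF000000u)
                        : static_cast<int32_t>(V);
}
static int32_t mulI24(int32_t A, int32_t B) {
  return static_cast<int32_t>(
      static_cast<uint32_t>(signExtend24(static_cast<uint32_t>(A))) *
      static_cast<uint32_t>(signExtend24(static_cast<uint32_t>(B))));
}

int main() {
  // Operands that pass isU24/isI24: the 24-bit multiply matches plain mul.
  assert(mulU24(100000u, 9000u) == 100000u * 9000u);
  assert(mulI24(-300000, 4000) == -300000 * 4000);
  // An operand that fails the width test: results diverge, so the pass
  // must not (and does not) fire.
  assert(mulU24(0x1000000u, 3u) != 0x1000000u * 3u);
}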