Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp')
-rw-r--r--    llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp    336
1 files changed, 336 insertions, 0 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp
new file mode 100644
index 000000000000..e5fbcca1e7d1
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp
@@ -0,0 +1,336 @@
+//===- AMDGPUImageIntrinsicOptimizer.cpp ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to combine multiple image_load intrinsics with dim=2dmsaa
+// or dim=2darraymsaa into a single image_msaa_load intrinsic if:
+//
+// - they refer to the same vaddr except for sample_id,
+// - they use a constant sample_id and they fall into the same group,
+// - they have the same dmask and the number of intrinsics and the number of
+//   vaddr/vdata dword transfers is reduced by the combine.
+//
+// Examples for the tradeoff (all are assuming 2DMsaa for vaddr):
+//
+// +----------+-----+-----+-------+---------+------------+---------+----------+
+// | popcount | a16 | d16 | #load | vaddr / | #msaa_load | vaddr / | combine? |
+// | (dmask)  |     |     |       | vdata   |            | vdata   |          |
+// +----------+-----+-----+-------+---------+------------+---------+----------+
+// | 1        | 0   | 0   | 4     | 12 / 4  | 1          | 3 / 4   | yes      |
+// +----------+-----+-----+-------+---------+------------+---------+----------+
+// | 1        | 0   | 0   | 2     | 6 / 2   | 1          | 3 / 4   | yes?     |
+// +----------+-----+-----+-------+---------+------------+---------+----------+
+// | 2        | 0   | 0   | 4     | 12 / 8  | 2          | 6 / 8   | yes      |
+// +----------+-----+-----+-------+---------+------------+---------+----------+
+// | 2        | 0   | 0   | 2     | 6 / 4   | 2          | 6 / 8   | no       |
+// +----------+-----+-----+-------+---------+------------+---------+----------+
+// | 1        | 0   | 1   | 2     | 6 / 2   | 1          | 3 / 2   | yes      |
+// +----------+-----+-----+-------+---------+------------+---------+----------+
+//
+// Some cases are of questionable benefit, like the one marked with "yes?"
+// above: fewer intrinsics and fewer vaddr and fewer total transfers between SP
+// and TX, but higher vdata. We start by erring on the side of converting these
+// to MSAA_LOAD.
+//
+// clang-format off
+//
+// This pass will combine intrinsics such as (not necessarily consecutive):
+// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
+// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
+// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
+// ==>
+// call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+//
+// clang-format on
+//
+// Future improvements:
+//
+// - We may occasionally not want to do the combine if it increases the maximum
+//   register pressure.
+//
+// - Ensure clausing when multiple MSAA_LOAD are generated.
+//
+// Note: Even though the image_msaa_load intrinsic already exists on gfx10, this
+// combine only applies to gfx11, due to a limitation in gfx10: the gfx10
+// IMAGE_MSAA_LOAD only works correctly with single-channel texture formats, and
+// we don't know the format at compile time.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUInstrInfo.h"
+#include "AMDGPUTargetMachine.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-image-intrinsic-opt"
+
+namespace {
+class AMDGPUImageIntrinsicOptimizer : public FunctionPass {
+  const TargetMachine *TM;
+
+public:
+  static char ID;
+
+  AMDGPUImageIntrinsicOptimizer(const TargetMachine *TM = nullptr)
+      : FunctionPass(ID), TM(TM) {}
+
+  bool runOnFunction(Function &F) override;
+
+}; // End of class AMDGPUImageIntrinsicOptimizer
+} // End anonymous namespace
+
+INITIALIZE_PASS(AMDGPUImageIntrinsicOptimizer, DEBUG_TYPE,
+                "AMDGPU Image Intrinsic Optimizer", false, false)
+
+char AMDGPUImageIntrinsicOptimizer::ID = 0;
+
+void addInstToMergeableList(
+    IntrinsicInst *II,
+    SmallVector<SmallVector<IntrinsicInst *, 4>> &MergeableInsts,
+    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) {
+  for (SmallVector<IntrinsicInst *, 4> &IIList : MergeableInsts) {
+    // Check Dim.
+    if (IIList.front()->getIntrinsicID() != II->getIntrinsicID())
+      continue;
+
+    // Check D16.
+    if (IIList.front()->getType() != II->getType())
+      continue;
+
+    // Check all arguments (DMask, VAddr, RSrc etc).
+    bool AllEqual = true;
+    assert(IIList.front()->arg_size() == II->arg_size());
+    for (int I = 1, E = II->arg_size(); AllEqual && I != E; ++I) {
+      Value *ArgList = IIList.front()->getArgOperand(I);
+      Value *Arg = II->getArgOperand(I);
+      if (I == ImageDimIntr->VAddrEnd - 1) {
+        // Check FragId group.
+        auto FragIdList = cast<ConstantInt>(IIList.front()->getArgOperand(I));
+        auto FragId = cast<ConstantInt>(II->getArgOperand(I));
+        AllEqual = FragIdList->getValue().udiv(4) == FragId->getValue().udiv(4);
+      } else {
+        // Check all arguments except FragId.
+        AllEqual = ArgList == Arg;
+      }
+    }
+    if (!AllEqual)
+      continue;
+
+    // Add to the list.
+    IIList.emplace_back(II);
+    return;
+  }
+
+  // Similar instruction not found, so add a new list.
+  MergeableInsts.emplace_back(1, II);
+  LLVM_DEBUG(dbgs() << "New: " << *II << "\n");
+}
+
+// Collect list of all instructions we know how to merge in a subset of the
+// block. It returns an iterator to the instruction after the last one
+// analyzed.
+BasicBlock::iterator collectMergeableInsts(
+    BasicBlock::iterator I, BasicBlock::iterator E,
+    SmallVector<SmallVector<IntrinsicInst *, 4>> &MergeableInsts) {
+  for (; I != E; ++I) {
+    // Don't combine if there is a store in the middle or if there is a memory
+    // barrier.
+    if (I->mayHaveSideEffects()) {
+      ++I;
+      break;
+    }
+
+    // Ignore non-intrinsics.
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+      Intrinsic::ID IntrinID = II->getIntrinsicID();
+
+      // Ignore other intrinsics.
+      if (IntrinID != Intrinsic::amdgcn_image_load_2dmsaa &&
+          IntrinID != Intrinsic::amdgcn_image_load_2darraymsaa)
+        continue;
+
+      // Check for constant FragId.
+      const auto *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinID);
+      const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
+      if (!isa<ConstantInt>(II->getArgOperand(FragIdIndex)))
+        continue;
+
+      LLVM_DEBUG(dbgs() << "Merge: " << *II << "\n");
+      addInstToMergeableList(II, MergeableInsts, ImageDimIntr);
+    }
+  }
+
+  return I;
+}
+
+bool optimizeSection(ArrayRef<SmallVector<IntrinsicInst *, 4>> MergeableInsts) {
+  bool Modified = false;
+
+  SmallVector<Instruction *, 4> InstrsToErase;
+  for (const auto &IIList : MergeableInsts) {
+    if (IIList.size() <= 1)
+      continue;
+
+    // Assume the arguments are unchanged and later override them, if needed.
+    SmallVector<Value *, 16> Args(IIList.front()->args());
+
+    // Validate function argument and return types, extracting overloaded
+    // types along the way.
+    SmallVector<Type *, 6> OverloadTys;
+    Function *F = IIList.front()->getCalledFunction();
+    if (!Intrinsic::getIntrinsicSignature(F, OverloadTys))
+      continue;
+
+    Intrinsic::ID IntrinID = IIList.front()->getIntrinsicID();
+    const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
+        AMDGPU::getImageDimIntrinsicInfo(IntrinID);
+
+    Type *EltTy = IIList.front()->getType()->getScalarType();
+    Type *NewTy = FixedVectorType::get(EltTy, 4);
+    OverloadTys[0] = NewTy;
+    bool isD16 = EltTy->isHalfTy();
+
+    ConstantInt *DMask = cast<ConstantInt>(
+        IIList.front()->getArgOperand(ImageDimIntr->DMaskIndex));
+    unsigned DMaskVal = DMask->getZExtValue() & 0xf;
+    unsigned NumElts = popcount(DMaskVal);
+
+    // Number of instructions and the number of vaddr/vdata dword transfers
+    // should be reduced.
+    unsigned NumLoads = IIList.size();
+    unsigned NumMsaas = NumElts;
+    unsigned NumVAddrLoads = 3 * NumLoads;
+    unsigned NumVDataLoads = divideCeil(NumElts, isD16 ? 2 : 1) * NumLoads;
+    unsigned NumVAddrMsaas = 3 * NumMsaas;
+    unsigned NumVDataMsaas = divideCeil(4, isD16 ? 2 : 1) * NumMsaas;
+
+    if (NumLoads < NumMsaas ||
+        (NumVAddrLoads + NumVDataLoads < NumVAddrMsaas + NumVDataMsaas))
+      continue;
+
+    const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
+    auto FragId = cast<ConstantInt>(IIList.front()->getArgOperand(FragIdIndex));
+    const APInt &NewFragIdVal = FragId->getValue().udiv(4) * 4;
+
+    // Create the new instructions.
+    IRBuilder<> B(IIList.front());
+
+    // Create the new image_msaa_load intrinsic.
+    SmallVector<Instruction *, 4> NewCalls;
+    while (DMaskVal != 0) {
+      unsigned NewMaskVal = 1 << countr_zero(DMaskVal);
+
+      Intrinsic::ID NewIntrinID;
+      if (IntrinID == Intrinsic::amdgcn_image_load_2dmsaa)
+        NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2dmsaa;
+      else
+        NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2darraymsaa;
+
+      Function *NewIntrin = Intrinsic::getDeclaration(
+          IIList.front()->getModule(), NewIntrinID, OverloadTys);
+      Args[ImageDimIntr->DMaskIndex] =
+          ConstantInt::get(DMask->getType(), NewMaskVal);
+      Args[FragIdIndex] = ConstantInt::get(FragId->getType(), NewFragIdVal);
+      CallInst *NewCall = B.CreateCall(NewIntrin, Args);
+      LLVM_DEBUG(dbgs() << "Optimize: " << *NewCall << "\n");
+
+      NewCalls.push_back(NewCall);
+      DMaskVal -= NewMaskVal;
+    }
+
+    // Create the new extractelement instructions.
+    for (auto &II : IIList) {
+      Value *VecOp = nullptr;
+      auto Idx = cast<ConstantInt>(II->getArgOperand(FragIdIndex));
+      B.SetCurrentDebugLocation(II->getDebugLoc());
+      if (NumElts == 1) {
+        VecOp = B.CreateExtractElement(NewCalls[0], Idx->getValue().urem(4));
+        LLVM_DEBUG(dbgs() << "Add: " << *VecOp << "\n");
+      } else {
+        VecOp = UndefValue::get(II->getType());
+        for (unsigned I = 0; I < NumElts; ++I) {
+          VecOp = B.CreateInsertElement(
+              VecOp,
+              B.CreateExtractElement(NewCalls[I], Idx->getValue().urem(4)), I);
+          LLVM_DEBUG(dbgs() << "Add: " << *VecOp << "\n");
+        }
+      }
+
+      // Replace the old instruction.
+      II->replaceAllUsesWith(VecOp);
+      VecOp->takeName(II);
+      InstrsToErase.push_back(II);
+    }
+
+    Modified = true;
+  }
+
+  for (auto I : InstrsToErase)
+    I->eraseFromParent();
+
+  return Modified;
+}
+
+static bool imageIntrinsicOptimizerImpl(Function &F, const TargetMachine *TM) {
+  if (!TM)
+    return false;
+
+  // This optimization only applies to GFX11 and beyond.
+  const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
+  if (!AMDGPU::isGFX11Plus(ST) || ST.hasMSAALoadDstSelBug())
+    return false;
+
+  Module *M = F.getParent();
+
+  // Early test to determine if the intrinsics are used.
+  if (std::none_of(M->begin(), M->end(), [](Function &F) {
+        return !F.users().empty() &&
+               (F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2dmsaa ||
+                F.getIntrinsicID() ==
+                    Intrinsic::amdgcn_image_load_2darraymsaa);
+      }))
+    return false;
+
+  bool Modified = false;
+  for (auto &BB : F) {
+    BasicBlock::iterator SectionEnd;
+    for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E;
+         I = SectionEnd) {
+      SmallVector<SmallVector<IntrinsicInst *, 4>> MergeableInsts;
+
+      SectionEnd = collectMergeableInsts(I, E, MergeableInsts);
+      Modified |= optimizeSection(MergeableInsts);
+    }
+  }
+
+  return Modified;
+}
+
+bool AMDGPUImageIntrinsicOptimizer::runOnFunction(Function &F) {
+  if (skipFunction(F))
+    return false;
+
+  return imageIntrinsicOptimizerImpl(F, TM);
+}
+
+FunctionPass *
+llvm::createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *TM) {
+  return new AMDGPUImageIntrinsicOptimizer(TM);
+}
+
+PreservedAnalyses
+AMDGPUImageIntrinsicOptimizerPass::run(Function &F,
+                                       FunctionAnalysisManager &AM) {
+
+  bool Changed = imageIntrinsicOptimizerImpl(F, &TM);
+  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
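To trace the tradeoff table in the file header against the profitability check in optimizeSection(), here is a minimal standalone sketch, not part of the patch; the names divideCeil and combineIsProfitable are illustrative, and it assumes 2dmsaa addressing (3 vaddr dwords per access, as in the table).

    // Sketch of the dword cost comparison performed by optimizeSection().
    #include <cstdio>

    static unsigned divideCeil(unsigned N, unsigned D) { return (N + D - 1) / D; }

    // NumLoads image_load calls, NumElts = popcount(dmask), IsD16 = 16-bit data.
    static bool combineIsProfitable(unsigned NumLoads, unsigned NumElts,
                                    bool IsD16) {
      unsigned NumMsaas = NumElts; // one msaa_load per enabled dmask channel
      unsigned VAddrLoads = 3 * NumLoads;
      unsigned VDataLoads = divideCeil(NumElts, IsD16 ? 2 : 1) * NumLoads;
      unsigned VAddrMsaas = 3 * NumMsaas;
      unsigned VDataMsaas = divideCeil(4, IsD16 ? 2 : 1) * NumMsaas;
      // Combine only when neither the intrinsic count nor the total
      // vaddr+vdata dword traffic increases.
      return NumLoads >= NumMsaas &&
             VAddrLoads + VDataLoads >= VAddrMsaas + VDataMsaas;
    }

    int main() {
      // The "yes?" row: dmask popcount 1, two loads, 32-bit data.
      // Before: 6 vaddr + 2 vdata = 8 dwords; after: 3 + 4 = 7 -> combine.
      std::printf("%d\n", combineIsProfitable(2, 1, false)); // 1
      // The "no" row: dmask popcount 2, two loads: 10 dwords vs 14 -> keep.
      std::printf("%d\n", combineIsProfitable(2, 2, false)); // 0
    }

Note that the pass's check allows equality, so a section whose dword traffic stays the same but whose intrinsic count shrinks is still combined.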
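The FragId handling in addInstToMergeableList() and in the extractelement loop reduces to simple modular arithmetic: loads are mergeable when their constant FragIds fall into the same group of four samples, the merged msaa_load is issued with the group's base FragId, and each original result is read back from lane FragId % 4 of the returned 4-element vector. A hypothetical table-printing snippet, for illustration only:

    #include <cstdio>

    int main() {
      for (unsigned FragId = 0; FragId < 8; ++FragId) {
        unsigned Group = FragId / 4;     // FragIdList->getValue().udiv(4)
        unsigned BaseFragId = Group * 4; // NewFragIdVal in optimizeSection()
        unsigned Lane = FragId % 4;      // Idx->getValue().urem(4)
        std::printf("FragId %u -> group %u, msaa_load FragId %u, lane %u\n",
                    FragId, Group, BaseFragId, Lane);
      }
    }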
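Finally, the while loop over DMaskVal in optimizeSection() is the standard lowest-set-bit walk: one image_msaa_load is emitted per set dmask bit, each carrying a single-bit mask. A small sketch of just that walk, using std::countr_zero from C++20 <bit> in place of llvm::countr_zero:

    #include <bit>
    #include <cstdio>

    int main() {
      unsigned DMaskVal = 0b0101; // e.g. channels r and b enabled
      while (DMaskVal != 0) {
        unsigned NewMaskVal = 1u << std::countr_zero(DMaskVal);
        std::printf("emit msaa_load with dmask 0x%x\n", NewMaskVal);
        DMaskVal -= NewMaskVal; // clear the lowest set bit
      }
    }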
