aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib/Target/ARM/MVETailPredication.cpp
diff options
context:
space:
mode:
authorDimitry Andric <dim@FreeBSD.org>2020-01-17 20:45:01 +0000
committerDimitry Andric <dim@FreeBSD.org>2020-01-17 20:45:01 +0000
commit706b4fc47bbc608932d3b491ae19a3b9cde9497b (patch)
tree4adf86a776049cbf7f69a1929c4babcbbef925eb /llvm/lib/Target/ARM/MVETailPredication.cpp
parent7cc9cf2bf09f069cb2dd947ead05d0b54301fb71 (diff)
Notes
Diffstat (limited to 'llvm/lib/Target/ARM/MVETailPredication.cpp')
-rw-r--r--llvm/lib/Target/ARM/MVETailPredication.cpp167
1 files changed, 100 insertions, 67 deletions
diff --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp
index 4db8ab17c49b..038c68739cdf 100644
--- a/llvm/lib/Target/ARM/MVETailPredication.cpp
+++ b/llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -20,7 +20,14 @@
/// - A tail-predicated loop, with implicit predication.
/// - A loop containing multiple VCPT instructions, predicating multiple VPT
/// blocks of instructions operating on different vector types.
+///
+/// This pass inserts the VCTP intrinsic to represent the effect of
+/// tail predication. This will be picked up by the ARM Low-overhead loop pass,
+/// which performs the final transformation to a DLSTP or WLSTP tail-predicated
+/// loop.
+#include "ARM.h"
+#include "ARMSubtarget.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
@@ -28,20 +35,19 @@
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/IR/Instructions.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "ARM.h"
-#include "ARMSubtarget.h"
using namespace llvm;
#define DEBUG_TYPE "mve-tail-predication"
#define DESC "Transform predicated vector loops to use MVE tail predication"
-static cl::opt<bool>
+cl::opt<bool>
DisableTailPredication("disable-mve-tail-predication", cl::Hidden,
cl::init(true),
cl::desc("Disable MVE Tail Predication"));
@@ -85,6 +91,12 @@ private:
/// Is the icmp that generates an i1 vector, based upon a loop counter
/// and a limit that is defined outside the loop.
bool isTailPredicate(Instruction *Predicate, Value *NumElements);
+
+ /// Insert the intrinsic to represent the effect of tail predication.
+ void InsertVCTPIntrinsic(Instruction *Predicate,
+ DenseMap<Instruction*, Instruction*> &NewPredicates,
+ VectorType *VecTy,
+ Value *NumElements);
};
} // end namespace
@@ -123,7 +135,7 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
// The MVE and LOB extensions are combined to enable tail-predication, but
// there's nothing preventing us from generating VCTP instructions for v8.1m.
if (!ST->hasMVEIntegerOps() || !ST->hasV8_1MMainlineOps()) {
- LLVM_DEBUG(dbgs() << "TP: Not a v8.1m.main+mve target.\n");
+ LLVM_DEBUG(dbgs() << "ARM TP: Not a v8.1m.main+mve target.\n");
return false;
}
@@ -148,7 +160,7 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
// Look for the hardware loop intrinsic that sets the iteration count.
IntrinsicInst *Setup = FindLoopIterations(Preheader);
- // The test.set iteration could live in the pre- preheader.
+ // The test.set iteration could live in the pre-preheader.
if (!Setup) {
if (!Preheader->getSinglePredecessor())
return false;
@@ -171,11 +183,9 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
if (!Decrement)
return false;
- LLVM_DEBUG(dbgs() << "TP: Running on Loop: " << *L
- << *Setup << "\n"
+ LLVM_DEBUG(dbgs() << "ARM TP: Running on Loop: " << *L << *Setup << "\n"
<< *Decrement << "\n");
- bool Changed = TryConvert(Setup->getArgOperand(0));
- return Changed;
+ return TryConvert(Setup->getArgOperand(0));
}
bool MVETailPredication::isTailPredicate(Instruction *I, Value *NumElements) {
@@ -208,7 +218,7 @@ bool MVETailPredication::isTailPredicate(Instruction *I, Value *NumElements) {
// The vector icmp
if (!match(I, m_ICmp(Pred, m_Instruction(Induction),
m_Instruction(Shuffle))) ||
- Pred != ICmpInst::ICMP_ULE || !L->isLoopInvariant(Shuffle))
+ Pred != ICmpInst::ICMP_ULE)
return false;
// First find the stuff outside the loop which is setting up the limit
@@ -230,11 +240,11 @@ bool MVETailPredication::isTailPredicate(Instruction *I, Value *NumElements) {
if (!match(BECount, m_Add(m_Value(TripCount), m_AllOnes())))
return false;
- if (TripCount != NumElements)
+ if (TripCount != NumElements || !L->isLoopInvariant(BECount))
return false;
// Now back to searching inside the loop body...
- // Find the add with takes the index iv and adds a constant vector to it.
+ // Find the add which takes the index iv and adds a constant vector to it.
Instruction *BroadcastSplat = nullptr;
Constant *Const = nullptr;
if (!match(Induction, m_Add(m_Instruction(BroadcastSplat),
@@ -269,14 +279,14 @@ bool MVETailPredication::isTailPredicate(Instruction *I, Value *NumElements) {
Value *OnEntry = Phi->getIncomingValueForBlock(L->getLoopPreheader());
if (!match(OnEntry, m_Zero()))
return false;
-
+
Value *InLoop = Phi->getIncomingValueForBlock(L->getLoopLatch());
unsigned Lanes = cast<VectorType>(Insert->getType())->getNumElements();
Instruction *LHS = nullptr;
if (!match(InLoop, m_Add(m_Instruction(LHS), m_SpecificInt(Lanes))))
return false;
-
+
return LHS == Phi;
}
@@ -298,8 +308,8 @@ bool MVETailPredication::IsPredicatedVectorLoop() {
unsigned ElementWidth = VecTy->getScalarSizeInBits();
// MVE vectors are 128-bit, but don't support 128 x i1.
// TODO: Can we support vectors larger than 128-bits?
- unsigned MaxWidth = TTI->getRegisterBitWidth(true);
- if (Lanes * ElementWidth != MaxWidth || Lanes == MaxWidth)
+ unsigned MaxWidth = TTI->getRegisterBitWidth(true);
+ if (Lanes * ElementWidth > MaxWidth || Lanes == MaxWidth)
return false;
MaskedInsts.push_back(cast<IntrinsicInst>(&I));
} else if (auto *Int = dyn_cast<IntrinsicInst>(&I)) {
@@ -399,19 +409,25 @@ Value* MVETailPredication::ComputeElements(Value *TripCount,
// tail predicated loop.
static void Cleanup(DenseMap<Instruction*, Instruction*> &NewPredicates,
SetVector<Instruction*> &MaybeDead, Loop *L) {
- if (BasicBlock *Exit = L->getUniqueExitBlock()) {
- for (auto &Pair : NewPredicates) {
- Instruction *OldPred = Pair.first;
- Instruction *NewPred = Pair.second;
-
- for (auto &I : *Exit) {
- if (I.isSameOperationAs(OldPred)) {
- Instruction *PredClone = NewPred->clone();
- PredClone->insertBefore(&I);
- I.replaceAllUsesWith(PredClone);
- MaybeDead.insert(&I);
- break;
- }
+ BasicBlock *Exit = L->getUniqueExitBlock();
+ if (!Exit) {
+ LLVM_DEBUG(dbgs() << "ARM TP: can't find loop exit block\n");
+ return;
+ }
+
+ for (auto &Pair : NewPredicates) {
+ Instruction *OldPred = Pair.first;
+ Instruction *NewPred = Pair.second;
+
+ for (auto &I : *Exit) {
+ if (I.isSameOperationAs(OldPred)) {
+ Instruction *PredClone = NewPred->clone();
+ PredClone->insertBefore(&I);
+ I.replaceAllUsesWith(PredClone);
+ MaybeDead.insert(&I);
+ LLVM_DEBUG(dbgs() << "ARM TP: replacing: "; I.dump();
+ dbgs() << "ARM TP: with: "; PredClone->dump());
+ break;
}
}
}
@@ -432,23 +448,69 @@ static void Cleanup(DenseMap<Instruction*, Instruction*> &NewPredicates,
Dead.insert(I);
}
- for (auto *I : Dead)
+ for (auto *I : Dead) {
+ LLVM_DEBUG(dbgs() << "ARM TP: removing dead insn: "; I->dump());
I->eraseFromParent();
+ }
for (auto I : L->blocks())
DeleteDeadPHIs(I);
}
+void MVETailPredication::InsertVCTPIntrinsic(Instruction *Predicate,
+ DenseMap<Instruction*, Instruction*> &NewPredicates,
+ VectorType *VecTy, Value *NumElements) {
+ IRBuilder<> Builder(L->getHeader()->getFirstNonPHI());
+ Module *M = L->getHeader()->getModule();
+ Type *Ty = IntegerType::get(M->getContext(), 32);
+
+ // Insert an i32 phi in the loop header that starts at NumElements on entry from the preheader and is decremented by the lane count each iteration.
+ PHINode *Processed = Builder.CreatePHI(Ty, 2);
+ Processed->addIncoming(NumElements, L->getLoopPreheader());
+
+ // Insert the VCTP intrinsic call immediately before the old predicate to represent the effect of tail predication.
+ Builder.SetInsertPoint(cast<Instruction>(Predicate));
+ ConstantInt *Factor =
+ ConstantInt::get(cast<IntegerType>(Ty), VecTy->getNumElements());
+
+ Intrinsic::ID VCTPID;
+ switch (VecTy->getNumElements()) {
+ default:
+ llvm_unreachable("unexpected number of lanes");
+ case 4: VCTPID = Intrinsic::arm_mve_vctp32; break;
+ case 8: VCTPID = Intrinsic::arm_mve_vctp16; break;
+ case 16: VCTPID = Intrinsic::arm_mve_vctp8; break;
+
+ // FIXME: vctp64 currently not supported because the predicate
+ // vector wants to be <2 x i1>, but v2i1 is not a legal MVE
+ // type, so problems happen at isel time.
+ // Intrinsic::arm_mve_vctp64 exists for ACLE intrinsics
+ // purposes, but takes a v4i1 instead of a v2i1.
+ }
+ Function *VCTP = Intrinsic::getDeclaration(M, VCTPID);
+ Value *TailPredicate = Builder.CreateCall(VCTP, Processed);
+ Predicate->replaceAllUsesWith(TailPredicate);
+ NewPredicates[Predicate] = cast<Instruction>(TailPredicate);
+
+ // Add the latch incoming value (the phi value minus the lane count) to the new phi.
+ // TODO: This decrement likely already exists in the loop.
+ Value *Remaining = Builder.CreateSub(Processed, Factor);
+ Processed->addIncoming(Remaining, L->getLoopLatch());
+ LLVM_DEBUG(dbgs() << "ARM TP: Insert processed elements phi: "
+ << *Processed << "\n"
+ << "ARM TP: Inserted VCTP: " << *TailPredicate << "\n");
+}
+
bool MVETailPredication::TryConvert(Value *TripCount) {
- if (!IsPredicatedVectorLoop())
+ if (!IsPredicatedVectorLoop()) {
+ LLVM_DEBUG(dbgs() << "ARM TP: no masked instructions in loop");
return false;
+ }
- LLVM_DEBUG(dbgs() << "TP: Found predicated vector loop.\n");
+ LLVM_DEBUG(dbgs() << "ARM TP: Found predicated vector loop.\n");
// Walk through the masked intrinsics and try to find whether the predicate
// operand is generated from an induction variable.
- Module *M = L->getHeader()->getModule();
- Type *Ty = IntegerType::get(M->getContext(), 32);
SetVector<Instruction*> Predicates;
DenseMap<Instruction*, Instruction*> NewPredicates;
@@ -465,43 +527,14 @@ bool MVETailPredication::TryConvert(Value *TripCount) {
continue;
if (!isTailPredicate(Predicate, NumElements)) {
- LLVM_DEBUG(dbgs() << "TP: Not tail predicate: " << *Predicate << "\n");
+ LLVM_DEBUG(dbgs() << "ARM TP: Not tail predicate: " << *Predicate << "\n");
continue;
}
- LLVM_DEBUG(dbgs() << "TP: Found tail predicate: " << *Predicate << "\n");
+ LLVM_DEBUG(dbgs() << "ARM TP: Found tail predicate: " << *Predicate << "\n");
Predicates.insert(Predicate);
- // Insert a phi to count the number of elements processed by the loop.
- IRBuilder<> Builder(L->getHeader()->getFirstNonPHI());
- PHINode *Processed = Builder.CreatePHI(Ty, 2);
- Processed->addIncoming(NumElements, L->getLoopPreheader());
-
- // Insert the intrinsic to represent the effect of tail predication.
- Builder.SetInsertPoint(cast<Instruction>(Predicate));
- ConstantInt *Factor =
- ConstantInt::get(cast<IntegerType>(Ty), VecTy->getNumElements());
- Intrinsic::ID VCTPID;
- switch (VecTy->getNumElements()) {
- default:
- llvm_unreachable("unexpected number of lanes");
- case 2: VCTPID = Intrinsic::arm_vctp64; break;
- case 4: VCTPID = Intrinsic::arm_vctp32; break;
- case 8: VCTPID = Intrinsic::arm_vctp16; break;
- case 16: VCTPID = Intrinsic::arm_vctp8; break;
- }
- Function *VCTP = Intrinsic::getDeclaration(M, VCTPID);
- Value *TailPredicate = Builder.CreateCall(VCTP, Processed);
- Predicate->replaceAllUsesWith(TailPredicate);
- NewPredicates[Predicate] = cast<Instruction>(TailPredicate);
-
- // Add the incoming value to the new phi.
- // TODO: This add likely already exists in the loop.
- Value *Remaining = Builder.CreateSub(Processed, Factor);
- Processed->addIncoming(Remaining, L->getLoopLatch());
- LLVM_DEBUG(dbgs() << "TP: Insert processed elements phi: "
- << *Processed << "\n"
- << "TP: Inserted VCTP: " << *TailPredicate << "\n");
+ InsertVCTPIntrinsic(Predicate, NewPredicates, VecTy, NumElements);
}
// Now clean up.