author | Dimitry Andric <dim@FreeBSD.org> | 2019-12-20 19:53:05 +0000
committer | Dimitry Andric <dim@FreeBSD.org> | 2019-12-20 19:53:05 +0000
commit | 0b57cec536236d46e3dba9bd041533462f33dbb7 (patch)
tree | 56229dbdbbf76d18580f72f789003db17246c8d9 /contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
parent | 718ef55ec7785aae63f98f8ca05dc07ed399c16d (diff)
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp')
-rw-r--r-- | contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp | 866 |
1 files changed, 866 insertions, 0 deletions
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
new file mode 100644
index 000000000000..1c0f836f07e6
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -0,0 +1,866 @@
+//===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers all occurrences of i1 values (with a vreg_1 register class)
+// to lane masks (32 / 64-bit scalar registers). The pass assumes machine SSA
+// form and a wave-level control flow graph.
+//
+// Before this pass, values that are semantically i1 and are defined and used
+// within the same basic block are already represented as lane masks in scalar
+// registers. However, values that cross basic blocks are always transferred
+// between basic blocks in vreg_1 virtual registers and are lowered by this
+// pass.
+//
+// The only instructions that use or define vreg_1 virtual registers are COPY,
+// PHI, and IMPLICIT_DEF.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSSAUpdater.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+
+#define DEBUG_TYPE "si-i1-copies"
+
+using namespace llvm;
+
+static unsigned createLaneMaskReg(MachineFunction &MF);
+static unsigned insertUndefLaneMask(MachineBasicBlock &MBB);
+
+namespace {
+
+class SILowerI1Copies : public MachineFunctionPass {
+public:
+  static char ID;
+
+private:
+  bool IsWave32 = false;
+  MachineFunction *MF = nullptr;
+  MachineDominatorTree *DT = nullptr;
+  MachinePostDominatorTree *PDT = nullptr;
+  MachineRegisterInfo *MRI = nullptr;
+  const GCNSubtarget *ST = nullptr;
+  const SIInstrInfo *TII = nullptr;
+
+  unsigned ExecReg;
+  unsigned MovOp;
+  unsigned AndOp;
+  unsigned OrOp;
+  unsigned XorOp;
+  unsigned AndN2Op;
+  unsigned OrN2Op;
+
+  DenseSet<unsigned> ConstrainRegs;
+
+public:
+  SILowerI1Copies() : MachineFunctionPass(ID) {
+    initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override { return "SI Lower i1 Copies"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<MachineDominatorTree>();
+    AU.addRequired<MachinePostDominatorTree>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+private:
+  void lowerCopiesFromI1();
+  void lowerPhis();
+  void lowerCopiesToI1();
+  bool isConstantLaneMask(unsigned Reg, bool &Val) const;
+  void buildMergeLaneMasks(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator I, const DebugLoc &DL,
+                           unsigned DstReg, unsigned PrevReg, unsigned CurReg);
+  MachineBasicBlock::iterator
+  getSaluInsertionAtEnd(MachineBasicBlock &MBB) const;
+
+  bool isVreg1(unsigned Reg) const {
+    return TargetRegisterInfo::isVirtualRegister(Reg) &&
+           MRI->getRegClass(Reg) == &AMDGPU::VReg_1RegClass;
+  }
+
+  bool isLaneMaskReg(unsigned Reg) const {
+    return TII->getRegisterInfo().isSGPRReg(*MRI, Reg) &&
+           TII->getRegisterInfo().getRegSizeInBits(Reg, *MRI) ==
+               ST->getWavefrontSize();
+  }
+};
+
+/// Helper class that determines the relationship between incoming values of a
+/// phi in the control flow graph to determine where an incoming value can
+/// simply be taken as a scalar lane mask as-is, and where it needs to be
+/// merged with another, previously defined lane mask.
+///
+/// The approach is as follows:
+///  - Determine all basic blocks which, starting from the incoming blocks,
+///    a wave may reach before entering the def block (the block containing the
+///    phi).
+///  - If an incoming block has no predecessors in this set, we can take the
+///    incoming value as a scalar lane mask as-is.
+///  -- A special case of this is when the def block has a self-loop.
+///  - Otherwise, the incoming value needs to be merged with a previously
+///    defined lane mask.
+///  - If there is a path into the set of reachable blocks that does _not_ go
+///    through an incoming block where we can take the scalar lane mask as-is,
+///    we need to invent an available value for the SSAUpdater. Choices are
+///    0 and undef, with differing consequences for how to merge values etc.
+///
+/// TODO: We could use region analysis to quickly skip over SESE regions during
+///       the traversal.
+///
+class PhiIncomingAnalysis {
+  MachinePostDominatorTree &PDT;
+
+  // For each reachable basic block, whether it is a source in the induced
+  // subgraph of the CFG.
+  DenseMap<MachineBasicBlock *, bool> ReachableMap;
+  SmallVector<MachineBasicBlock *, 4> ReachableOrdered;
+  SmallVector<MachineBasicBlock *, 4> Stack;
+  SmallVector<MachineBasicBlock *, 4> Predecessors;
+
+public:
+  PhiIncomingAnalysis(MachinePostDominatorTree &PDT) : PDT(PDT) {}
+
+  /// Returns whether \p MBB is a source in the induced subgraph of reachable
+  /// blocks.
+  bool isSource(MachineBasicBlock &MBB) const {
+    return ReachableMap.find(&MBB)->second;
+  }
+
+  ArrayRef<MachineBasicBlock *> predecessors() const { return Predecessors; }
+
+  void analyze(MachineBasicBlock &DefBlock,
+               ArrayRef<MachineBasicBlock *> IncomingBlocks) {
+    assert(Stack.empty());
+    ReachableMap.clear();
+    ReachableOrdered.clear();
+    Predecessors.clear();
+
+    // Insert the def block first, so that it acts as an end point for the
+    // traversal.
+    ReachableMap.try_emplace(&DefBlock, false);
+    ReachableOrdered.push_back(&DefBlock);
+
+    for (MachineBasicBlock *MBB : IncomingBlocks) {
+      if (MBB == &DefBlock) {
+        ReachableMap[&DefBlock] = true; // self-loop on DefBlock
+        continue;
+      }
+
+      ReachableMap.try_emplace(MBB, false);
+      ReachableOrdered.push_back(MBB);
+
+      // If this block has a divergent terminator and the def block is its
+      // post-dominator, the wave may first visit the other successors.
+      bool Divergent = false;
+      for (MachineInstr &MI : MBB->terminators()) {
+        if (MI.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO ||
+            MI.getOpcode() == AMDGPU::SI_IF ||
+            MI.getOpcode() == AMDGPU::SI_ELSE ||
+            MI.getOpcode() == AMDGPU::SI_LOOP) {
+          Divergent = true;
+          break;
+        }
+      }
+
+      if (Divergent && PDT.dominates(&DefBlock, MBB)) {
+        for (MachineBasicBlock *Succ : MBB->successors())
+          Stack.push_back(Succ);
+      }
+    }
+
+    while (!Stack.empty()) {
+      MachineBasicBlock *MBB = Stack.pop_back_val();
+      if (!ReachableMap.try_emplace(MBB, false).second)
+        continue;
+      ReachableOrdered.push_back(MBB);
+
+      for (MachineBasicBlock *Succ : MBB->successors())
+        Stack.push_back(Succ);
+    }
+
+    for (MachineBasicBlock *MBB : ReachableOrdered) {
+      bool HaveReachablePred = false;
+      for (MachineBasicBlock *Pred : MBB->predecessors()) {
+        if (ReachableMap.count(Pred)) {
+          HaveReachablePred = true;
+        } else {
+          Stack.push_back(Pred);
+        }
+      }
+      if (!HaveReachablePred)
+        ReachableMap[MBB] = true;
+      if (HaveReachablePred) {
+        for (MachineBasicBlock *UnreachablePred : Stack) {
+          if (llvm::find(Predecessors, UnreachablePred) == Predecessors.end())
+            Predecessors.push_back(UnreachablePred);
+        }
+      }
+      Stack.clear();
+    }
+  }
+};
+
+/// Helper class that detects loops which require us to lower an i1 COPY into
+/// bitwise manipulation.
+///
+/// Unfortunately, we cannot use LoopInfo because LoopInfo does not distinguish
+/// between loops with the same header. Consider this example:
+///
+///  A-+-+
+///  | | |
+///  B-+ |
+///  |   |
+///  C---+
+///
+/// A is the header of a loop containing A, B, and C as far as LoopInfo is
+/// concerned. However, an i1 COPY in B that is used in C must be lowered to
+/// bitwise operations to combine results from different loop iterations when
+/// B has a divergent branch (since by default we will compile this code such
+/// that threads in a wave are merged at the entry of C).
+///
+/// The following rule is implemented to determine whether bitwise operations
+/// are required: use the bitwise lowering for a def in block B if a backward
+/// edge to B is reachable without going through the nearest common
+/// post-dominator of B and all uses of the def.
+///
+/// TODO: This rule is conservative because it does not check whether the
+///       relevant branches are actually divergent.
+///
+/// The class is designed to cache the CFG traversal so that it can be re-used
+/// for multiple defs within the same basic block.
+///
+/// TODO: We could use region analysis to quickly skip over SESE regions during
+///       the traversal.
+///
+class LoopFinder {
+  MachineDominatorTree &DT;
+  MachinePostDominatorTree &PDT;
+
+  // All visited / reachable block, tagged by level (level 0 is the def block,
+  // level 1 are all blocks reachable including but not going through the def
+  // block's IPDOM, etc.).
+  DenseMap<MachineBasicBlock *, unsigned> Visited;
+
+  // Nearest common dominator of all visited blocks by level (level 0 is the
+  // def block). Used for seeding the SSAUpdater.
+  SmallVector<MachineBasicBlock *, 4> CommonDominators;
+
+  // Post-dominator of all visited blocks.
+  MachineBasicBlock *VisitedPostDom = nullptr;
+
+  // Level at which a loop was found: 0 is not possible; 1 = a backward edge is
+  // reachable without going through the IPDOM of the def block (if the IPDOM
+  // itself has an edge to the def block, the loop level is 2), etc.
+  unsigned FoundLoopLevel = ~0u;
+
+  MachineBasicBlock *DefBlock = nullptr;
+  SmallVector<MachineBasicBlock *, 4> Stack;
+  SmallVector<MachineBasicBlock *, 4> NextLevel;
+
+public:
+  LoopFinder(MachineDominatorTree &DT, MachinePostDominatorTree &PDT)
+      : DT(DT), PDT(PDT) {}
+
+  void initialize(MachineBasicBlock &MBB) {
+    Visited.clear();
+    CommonDominators.clear();
+    Stack.clear();
+    NextLevel.clear();
+    VisitedPostDom = nullptr;
+    FoundLoopLevel = ~0u;
+
+    DefBlock = &MBB;
+  }
+
+  /// Check whether a backward edge can be reached without going through the
+  /// given \p PostDom of the def block.
+  ///
+  /// Return the level of \p PostDom if a loop was found, or 0 otherwise.
+  unsigned findLoop(MachineBasicBlock *PostDom) {
+    MachineDomTreeNode *PDNode = PDT.getNode(DefBlock);
+
+    if (!VisitedPostDom)
+      advanceLevel();
+
+    unsigned Level = 0;
+    while (PDNode->getBlock() != PostDom) {
+      if (PDNode->getBlock() == VisitedPostDom)
+        advanceLevel();
+      PDNode = PDNode->getIDom();
+      Level++;
+      if (FoundLoopLevel == Level)
+        return Level;
+    }
+
+    return 0;
+  }
+
+  /// Add undef values dominating the loop and the optionally given additional
+  /// blocks, so that the SSA updater doesn't have to search all the way to the
+  /// function entry.
+  void addLoopEntries(unsigned LoopLevel, MachineSSAUpdater &SSAUpdater,
+                      ArrayRef<MachineBasicBlock *> Blocks = {}) {
+    assert(LoopLevel < CommonDominators.size());
+
+    MachineBasicBlock *Dom = CommonDominators[LoopLevel];
+    for (MachineBasicBlock *MBB : Blocks)
+      Dom = DT.findNearestCommonDominator(Dom, MBB);
+
+    if (!inLoopLevel(*Dom, LoopLevel, Blocks)) {
+      SSAUpdater.AddAvailableValue(Dom, insertUndefLaneMask(*Dom));
+    } else {
+      // The dominator is part of the loop or the given blocks, so add the
+      // undef value to unreachable predecessors instead.
+      for (MachineBasicBlock *Pred : Dom->predecessors()) {
+        if (!inLoopLevel(*Pred, LoopLevel, Blocks))
+          SSAUpdater.AddAvailableValue(Pred, insertUndefLaneMask(*Pred));
+      }
+    }
+  }
+
+private:
+  bool inLoopLevel(MachineBasicBlock &MBB, unsigned LoopLevel,
+                   ArrayRef<MachineBasicBlock *> Blocks) const {
+    auto DomIt = Visited.find(&MBB);
+    if (DomIt != Visited.end() && DomIt->second <= LoopLevel)
+      return true;
+
+    if (llvm::find(Blocks, &MBB) != Blocks.end())
+      return true;
+
+    return false;
+  }
+
+  void advanceLevel() {
+    MachineBasicBlock *VisitedDom;
+
+    if (!VisitedPostDom) {
+      VisitedPostDom = DefBlock;
+      VisitedDom = DefBlock;
+      Stack.push_back(DefBlock);
+    } else {
+      VisitedPostDom = PDT.getNode(VisitedPostDom)->getIDom()->getBlock();
+      VisitedDom = CommonDominators.back();
+
+      for (unsigned i = 0; i < NextLevel.size();) {
+        if (PDT.dominates(VisitedPostDom, NextLevel[i])) {
+          Stack.push_back(NextLevel[i]);
+
+          NextLevel[i] = NextLevel.back();
+          NextLevel.pop_back();
+        } else {
+          i++;
+        }
+      }
+    }
+
+    unsigned Level = CommonDominators.size();
+    while (!Stack.empty()) {
+      MachineBasicBlock *MBB = Stack.pop_back_val();
+      if (!PDT.dominates(VisitedPostDom, MBB))
+        NextLevel.push_back(MBB);
+
+      Visited[MBB] = Level;
+      VisitedDom = DT.findNearestCommonDominator(VisitedDom, MBB);
+
+      for (MachineBasicBlock *Succ : MBB->successors()) {
+        if (Succ == DefBlock) {
+          if (MBB == VisitedPostDom)
+            FoundLoopLevel = std::min(FoundLoopLevel, Level + 1);
+          else
+            FoundLoopLevel = std::min(FoundLoopLevel, Level);
+          continue;
+        }
+
+        if (Visited.try_emplace(Succ, ~0u).second) {
+          if (MBB == VisitedPostDom)
+            NextLevel.push_back(Succ);
+          else
+            Stack.push_back(Succ);
+        }
+      }
+    }
+
+    CommonDominators.push_back(VisitedDom);
+  }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false,
+                      false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false,
+                    false)
+
+char SILowerI1Copies::ID = 0;
+
+char &llvm::SILowerI1CopiesID = SILowerI1Copies::ID;
+
+FunctionPass *llvm::createSILowerI1CopiesPass() {
+  return new SILowerI1Copies();
+}
+
+static unsigned createLaneMaskReg(MachineFunction &MF) {
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  return MRI.createVirtualRegister(ST.isWave32() ? &AMDGPU::SReg_32RegClass
+                                                 : &AMDGPU::SReg_64RegClass);
+}
+
+static unsigned insertUndefLaneMask(MachineBasicBlock &MBB) {
+  MachineFunction &MF = *MBB.getParent();
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  unsigned UndefReg = createLaneMaskReg(MF);
+  BuildMI(MBB, MBB.getFirstTerminator(), {}, TII->get(AMDGPU::IMPLICIT_DEF),
+          UndefReg);
+  return UndefReg;
+}
+
+/// Lower all instructions that def or use vreg_1 registers.
+///
+/// In a first pass, we lower COPYs from vreg_1 to vector registers, as can
+/// occur around inline assembly. We do this first, before vreg_1 registers
+/// are changed to scalar mask registers.
+///
+/// Then we lower all defs of vreg_1 registers. Phi nodes are lowered before
+/// all others, because phi lowering looks through copies and can therefore
+/// often make copy lowering unnecessary.
+bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) {
+  MF = &TheMF;
+  MRI = &MF->getRegInfo();
+  DT = &getAnalysis<MachineDominatorTree>();
+  PDT = &getAnalysis<MachinePostDominatorTree>();
+
+  ST = &MF->getSubtarget<GCNSubtarget>();
+  TII = ST->getInstrInfo();
+  IsWave32 = ST->isWave32();
+
+  if (IsWave32) {
+    ExecReg = AMDGPU::EXEC_LO;
+    MovOp = AMDGPU::S_MOV_B32;
+    AndOp = AMDGPU::S_AND_B32;
+    OrOp = AMDGPU::S_OR_B32;
+    XorOp = AMDGPU::S_XOR_B32;
+    AndN2Op = AMDGPU::S_ANDN2_B32;
+    OrN2Op = AMDGPU::S_ORN2_B32;
+  } else {
+    ExecReg = AMDGPU::EXEC;
+    MovOp = AMDGPU::S_MOV_B64;
+    AndOp = AMDGPU::S_AND_B64;
+    OrOp = AMDGPU::S_OR_B64;
+    XorOp = AMDGPU::S_XOR_B64;
+    AndN2Op = AMDGPU::S_ANDN2_B64;
+    OrN2Op = AMDGPU::S_ORN2_B64;
+  }
+
+  lowerCopiesFromI1();
+  lowerPhis();
+  lowerCopiesToI1();
+
+  for (unsigned Reg : ConstrainRegs)
+    MRI->constrainRegClass(Reg, &AMDGPU::SReg_1_XEXECRegClass);
+  ConstrainRegs.clear();
+
+  return true;
+}
+
+void SILowerI1Copies::lowerCopiesFromI1() {
+  SmallVector<MachineInstr *, 4> DeadCopies;
+
+  for (MachineBasicBlock &MBB : *MF) {
+    for (MachineInstr &MI : MBB) {
+      if (MI.getOpcode() != AMDGPU::COPY)
+        continue;
+
+      unsigned DstReg = MI.getOperand(0).getReg();
+      unsigned SrcReg = MI.getOperand(1).getReg();
+      if (!isVreg1(SrcReg))
+        continue;
+
+      if (isLaneMaskReg(DstReg) || isVreg1(DstReg))
+        continue;
+
+      // Copy into a 32-bit vector register.
+      LLVM_DEBUG(dbgs() << "Lower copy from i1: " << MI);
+      DebugLoc DL = MI.getDebugLoc();
+
+      assert(TII->getRegisterInfo().getRegSizeInBits(DstReg, *MRI) == 32);
+      assert(!MI.getOperand(0).getSubReg());
+
+      ConstrainRegs.insert(SrcReg);
+      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+          .addImm(0)
+          .addImm(0)
+          .addImm(0)
+          .addImm(-1)
+          .addReg(SrcReg);
+      DeadCopies.push_back(&MI);
+    }
+
+    for (MachineInstr *MI : DeadCopies)
+      MI->eraseFromParent();
+    DeadCopies.clear();
+  }
+}
+
+void SILowerI1Copies::lowerPhis() {
+  MachineSSAUpdater SSAUpdater(*MF);
+  LoopFinder LF(*DT, *PDT);
+  PhiIncomingAnalysis PIA(*PDT);
+  SmallVector<MachineInstr *, 4> DeadPhis;
+  SmallVector<MachineBasicBlock *, 4> IncomingBlocks;
+  SmallVector<unsigned, 4> IncomingRegs;
+  SmallVector<unsigned, 4> IncomingUpdated;
+#ifndef NDEBUG
+  DenseSet<unsigned> PhiRegisters;
+#endif
+
+  for (MachineBasicBlock &MBB : *MF) {
+    LF.initialize(MBB);
+
+    for (MachineInstr &MI : MBB.phis()) {
+      unsigned DstReg = MI.getOperand(0).getReg();
+      if (!isVreg1(DstReg))
+        continue;
+
+      LLVM_DEBUG(dbgs() << "Lower PHI: " << MI);
+
+      MRI->setRegClass(DstReg, IsWave32 ? &AMDGPU::SReg_32RegClass
+                                        : &AMDGPU::SReg_64RegClass);
+
+      // Collect incoming values.
+      for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
+        assert(i + 1 < MI.getNumOperands());
+        unsigned IncomingReg = MI.getOperand(i).getReg();
+        MachineBasicBlock *IncomingMBB = MI.getOperand(i + 1).getMBB();
+        MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg);
+
+        if (IncomingDef->getOpcode() == AMDGPU::COPY) {
+          IncomingReg = IncomingDef->getOperand(1).getReg();
+          assert(isLaneMaskReg(IncomingReg) || isVreg1(IncomingReg));
+          assert(!IncomingDef->getOperand(1).getSubReg());
+        } else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) {
+          continue;
+        } else {
+          assert(IncomingDef->isPHI() || PhiRegisters.count(IncomingReg));
+        }
+
+        IncomingBlocks.push_back(IncomingMBB);
+        IncomingRegs.push_back(IncomingReg);
+      }
+
+#ifndef NDEBUG
+      PhiRegisters.insert(DstReg);
+#endif
+
+      // Phis in a loop that are observed outside the loop receive a simple but
+      // conservatively correct treatment.
+      MachineBasicBlock *PostDomBound = &MBB;
+      for (MachineInstr &Use : MRI->use_instructions(DstReg)) {
+        PostDomBound =
+            PDT->findNearestCommonDominator(PostDomBound, Use.getParent());
+      }
+
+      unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
+
+      SSAUpdater.Initialize(DstReg);
+
+      if (FoundLoopLevel) {
+        LF.addLoopEntries(FoundLoopLevel, SSAUpdater, IncomingBlocks);
+
+        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
+          IncomingUpdated.push_back(createLaneMaskReg(*MF));
+          SSAUpdater.AddAvailableValue(IncomingBlocks[i],
+                                       IncomingUpdated.back());
+        }
+
+        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
+          MachineBasicBlock &IMBB = *IncomingBlocks[i];
+          buildMergeLaneMasks(
+              IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
+              SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
+        }
+      } else {
+        // The phi is not observed from outside a loop. Use a more accurate
+        // lowering.
+        PIA.analyze(MBB, IncomingBlocks);
+
+        for (MachineBasicBlock *MBB : PIA.predecessors())
+          SSAUpdater.AddAvailableValue(MBB, insertUndefLaneMask(*MBB));
+
+        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
+          MachineBasicBlock &IMBB = *IncomingBlocks[i];
+          if (PIA.isSource(IMBB)) {
+            IncomingUpdated.push_back(0);
+            SSAUpdater.AddAvailableValue(&IMBB, IncomingRegs[i]);
+          } else {
+            IncomingUpdated.push_back(createLaneMaskReg(*MF));
+            SSAUpdater.AddAvailableValue(&IMBB, IncomingUpdated.back());
+          }
+        }
+
+        for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
+          if (!IncomingUpdated[i])
+            continue;
+
+          MachineBasicBlock &IMBB = *IncomingBlocks[i];
+          buildMergeLaneMasks(
+              IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
+              SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
+        }
+      }
+
+      unsigned NewReg = SSAUpdater.GetValueInMiddleOfBlock(&MBB);
+      if (NewReg != DstReg) {
+        MRI->replaceRegWith(NewReg, DstReg);
+
+        // Ensure that DstReg has a single def and mark the old PHI node for
+        // deletion.
+        MI.getOperand(0).setReg(NewReg);
+        DeadPhis.push_back(&MI);
+      }
+
+      IncomingBlocks.clear();
+      IncomingRegs.clear();
+      IncomingUpdated.clear();
+    }
+
+    for (MachineInstr *MI : DeadPhis)
+      MI->eraseFromParent();
+    DeadPhis.clear();
+  }
+}
+
+void SILowerI1Copies::lowerCopiesToI1() {
+  MachineSSAUpdater SSAUpdater(*MF);
+  LoopFinder LF(*DT, *PDT);
+  SmallVector<MachineInstr *, 4> DeadCopies;
+
+  for (MachineBasicBlock &MBB : *MF) {
+    LF.initialize(MBB);
+
+    for (MachineInstr &MI : MBB) {
+      if (MI.getOpcode() != AMDGPU::IMPLICIT_DEF &&
+          MI.getOpcode() != AMDGPU::COPY)
+        continue;
+
+      unsigned DstReg = MI.getOperand(0).getReg();
+      if (!isVreg1(DstReg))
+        continue;
+
+      if (MRI->use_empty(DstReg)) {
+        DeadCopies.push_back(&MI);
+        continue;
+      }
+
+      LLVM_DEBUG(dbgs() << "Lower Other: " << MI);
+
+      MRI->setRegClass(DstReg, IsWave32 ? &AMDGPU::SReg_32RegClass
+                                        : &AMDGPU::SReg_64RegClass);
+      if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF)
+        continue;
+
+      DebugLoc DL = MI.getDebugLoc();
+      unsigned SrcReg = MI.getOperand(1).getReg();
+      assert(!MI.getOperand(1).getSubReg());
+
+      if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
+          (!isLaneMaskReg(SrcReg) && !isVreg1(SrcReg))) {
+        assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32);
+        unsigned TmpReg = createLaneMaskReg(*MF);
+        BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg)
+            .addReg(SrcReg)
+            .addImm(0);
+        MI.getOperand(1).setReg(TmpReg);
+        SrcReg = TmpReg;
+      }
+
+      // Defs in a loop that are observed outside the loop must be transformed
+      // into appropriate bit manipulation.
+      MachineBasicBlock *PostDomBound = &MBB;
+      for (MachineInstr &Use : MRI->use_instructions(DstReg)) {
+        PostDomBound =
+            PDT->findNearestCommonDominator(PostDomBound, Use.getParent());
+      }
+
+      unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
+      if (FoundLoopLevel) {
+        SSAUpdater.Initialize(DstReg);
+        SSAUpdater.AddAvailableValue(&MBB, DstReg);
+        LF.addLoopEntries(FoundLoopLevel, SSAUpdater);
+
+        buildMergeLaneMasks(MBB, MI, DL, DstReg,
+                            SSAUpdater.GetValueInMiddleOfBlock(&MBB), SrcReg);
+        DeadCopies.push_back(&MI);
+      }
+    }
+
+    for (MachineInstr *MI : DeadCopies)
+      MI->eraseFromParent();
+    DeadCopies.clear();
+  }
+}
+
+bool SILowerI1Copies::isConstantLaneMask(unsigned Reg, bool &Val) const {
+  const MachineInstr *MI;
+  for (;;) {
+    MI = MRI->getUniqueVRegDef(Reg);
+    if (MI->getOpcode() != AMDGPU::COPY)
+      break;
+
+    Reg = MI->getOperand(1).getReg();
+    if (!TargetRegisterInfo::isVirtualRegister(Reg))
+      return false;
+    if (!isLaneMaskReg(Reg))
+      return false;
+  }
+
+  if (MI->getOpcode() != MovOp)
+    return false;
+
+  if (!MI->getOperand(1).isImm())
+    return false;
+
+  int64_t Imm = MI->getOperand(1).getImm();
+  if (Imm == 0) {
+    Val = false;
+    return true;
+  }
+  if (Imm == -1) {
+    Val = true;
+    return true;
+  }
+
+  return false;
+}
+
+static void instrDefsUsesSCC(const MachineInstr &MI, bool &Def, bool &Use) {
+  Def = false;
+  Use = false;
+
+  for (const MachineOperand &MO : MI.operands()) {
+    if (MO.isReg() && MO.getReg() == AMDGPU::SCC) {
+      if (MO.isUse())
+        Use = true;
+      else
+        Def = true;
+    }
+  }
+}
+
+/// Return a point at the end of the given \p MBB to insert SALU instructions
+/// for lane mask calculation. Take terminators and SCC into account.
+MachineBasicBlock::iterator
+SILowerI1Copies::getSaluInsertionAtEnd(MachineBasicBlock &MBB) const {
+  auto InsertionPt = MBB.getFirstTerminator();
+  bool TerminatorsUseSCC = false;
+  for (auto I = InsertionPt, E = MBB.end(); I != E; ++I) {
+    bool DefsSCC;
+    instrDefsUsesSCC(*I, DefsSCC, TerminatorsUseSCC);
+    if (TerminatorsUseSCC || DefsSCC)
+      break;
+  }
+
+  if (!TerminatorsUseSCC)
+    return InsertionPt;
+
+  while (InsertionPt != MBB.begin()) {
+    InsertionPt--;
+
+    bool DefSCC, UseSCC;
+    instrDefsUsesSCC(*InsertionPt, DefSCC, UseSCC);
+    if (DefSCC)
+      return InsertionPt;
+  }
+
+  // We should have at least seen an IMPLICIT_DEF or COPY
+  llvm_unreachable("SCC used by terminator but no def in block");
+}
+
+void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator I,
+                                          const DebugLoc &DL, unsigned DstReg,
+                                          unsigned PrevReg, unsigned CurReg) {
+  bool PrevVal;
+  bool PrevConstant = isConstantLaneMask(PrevReg, PrevVal);
+  bool CurVal;
+  bool CurConstant = isConstantLaneMask(CurReg, CurVal);
+
+  if (PrevConstant && CurConstant) {
+    if (PrevVal == CurVal) {
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(CurReg);
+    } else if (CurVal) {
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(ExecReg);
+    } else {
+      BuildMI(MBB, I, DL, TII->get(XorOp), DstReg)
+          .addReg(ExecReg)
+          .addImm(-1);
+    }
+    return;
+  }
+
+  unsigned PrevMaskedReg = 0;
+  unsigned CurMaskedReg = 0;
+  if (!PrevConstant) {
+    if (CurConstant && CurVal) {
+      PrevMaskedReg = PrevReg;
+    } else {
+      PrevMaskedReg = createLaneMaskReg(*MF);
+      BuildMI(MBB, I, DL, TII->get(AndN2Op), PrevMaskedReg)
+          .addReg(PrevReg)
+          .addReg(ExecReg);
+    }
+  }
+  if (!CurConstant) {
+    // TODO: check whether CurReg is already masked by EXEC
+    if (PrevConstant && PrevVal) {
+      CurMaskedReg = CurReg;
+    } else {
+      CurMaskedReg = createLaneMaskReg(*MF);
+      BuildMI(MBB, I, DL, TII->get(AndOp), CurMaskedReg)
+          .addReg(CurReg)
+          .addReg(ExecReg);
+    }
+  }
+
+  if (PrevConstant && !PrevVal) {
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
+        .addReg(CurMaskedReg);
+  } else if (CurConstant && !CurVal) {
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
+        .addReg(PrevMaskedReg);
+  } else if (PrevConstant && PrevVal) {
+    BuildMI(MBB, I, DL, TII->get(OrN2Op), DstReg)
+        .addReg(CurMaskedReg)
+        .addReg(ExecReg);
+  } else {
+    BuildMI(MBB, I, DL, TII->get(OrOp), DstReg)
+        .addReg(PrevMaskedReg)
+        .addReg(CurMaskedReg ? CurMaskedReg : ExecReg);
+  }
+}