Diffstat (limited to 'lib/Target/NVPTX')
 lib/Target/NVPTX/NVPTX.h                   |   2
 lib/Target/NVPTX/NVPTXAsmPrinter.cpp       |  34
 lib/Target/NVPTX/NVPTXAsmPrinter.h         |   2
 lib/Target/NVPTX/NVPTXFrameLowering.cpp    |   2
 lib/Target/NVPTX/NVPTXISelLowering.cpp     |  58
 lib/Target/NVPTX/NVPTXInstrInfo.td         |  13
 lib/Target/NVPTX/NVPTXIntrinsics.td        | 169
 lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp  |   2
 lib/Target/NVPTX/NVPTXLowerAlloca.cpp      |  97
 lib/Target/NVPTX/NVPTXLowerArgs.cpp        |   2
 lib/Target/NVPTX/NVPTXPeephole.cpp         |   2
 lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp |   2
 lib/Target/NVPTX/NVPTXTargetMachine.cpp    |   2
 lib/Target/NVPTX/NVPTXUtilities.cpp        |  13
14 files changed, 178 insertions, 222 deletions
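A large share of the NVPTXAsmPrinter.cpp and NVPTXISelLowering.cpp churn in the diff below is the migration from raw `unsigned` alignments to the typed `llvm::Align`/`llvm::MaybeAlign` wrappers. As orientation only, here is a minimal sketch of that API, assuming the LLVM-9-era `llvm/Support/Alignment.h` interface; the helper name `pickParamAlign` is hypothetical and not part of the patch:

```cpp
// Illustrative sketch only -- not code from this patch.
#include "llvm/Support/Alignment.h"

// MaybeAlign models "alignment possibly unspecified" (e.g. a parameter with no
// explicit align attribute); Align is always a valid, known power of two.
unsigned pickParamAlign(llvm::MaybeAlign ParamAlign, llvm::Align ABIAlign) {
  llvm::Align A = ParamAlign ? *ParamAlign : ABIAlign; // fall back to the ABI alignment
  if (A < llvm::Align(4))
    A = llvm::Align(4);  // same kind of clamp the patch applies to byval params
  return A.value();      // Align::value() returns the alignment in bytes
}
```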
diff --git a/lib/Target/NVPTX/NVPTX.h b/lib/Target/NVPTX/NVPTX.h index 6530c40ea100..0acbace5f848 100644 --- a/lib/Target/NVPTX/NVPTX.h +++ b/lib/Target/NVPTX/NVPTX.h @@ -44,7 +44,7 @@ MachineFunctionPass *createNVPTXPrologEpilogPass(); MachineFunctionPass *createNVPTXReplaceImageHandlesPass(); FunctionPass *createNVPTXImageOptimizerPass(); FunctionPass *createNVPTXLowerArgsPass(const NVPTXTargetMachine *TM); -BasicBlockPass *createNVPTXLowerAllocaPass(); +FunctionPass *createNVPTXLowerAllocaPass(); MachineFunctionPass *createNVPTXPeephole(); MachineFunctionPass *createNVPTXProxyRegErasurePass(); diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index 5f38b4a3c4c5..307f4d58c3ab 100644 --- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -282,7 +282,7 @@ bool NVPTXAsmPrinter::lowerOperand(const MachineOperand &MO, } unsigned NVPTXAsmPrinter::encodeVirtualRegister(unsigned Reg) { - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { const TargetRegisterClass *RC = MRI->getRegClass(Reg); DenseMap<unsigned, unsigned> &RegMap = VRegMapping[RC]; @@ -434,7 +434,7 @@ bool NVPTXAsmPrinter::isLoopHeaderOfNoUnroll( return false; } -void NVPTXAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const { +void NVPTXAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) { AsmPrinter::EmitBasicBlockStart(MBB); if (isLoopHeaderOfNoUnroll(MBB)) OutStreamer->EmitRawText(StringRef("\t.pragma \"nounroll\";\n")); @@ -507,8 +507,8 @@ const MCSymbol *NVPTXAsmPrinter::getFunctionFrameSymbol() const { } void NVPTXAsmPrinter::emitImplicitDef(const MachineInstr *MI) const { - unsigned RegNo = MI->getOperand(0).getReg(); - if (TargetRegisterInfo::isVirtualRegister(RegNo)) { + Register RegNo = MI->getOperand(0).getReg(); + if (Register::isVirtualRegister(RegNo)) { OutStreamer->AddComment(Twine("implicit-def: ") + getVirtualRegisterName(RegNo)); } else { @@ -1397,7 +1397,7 @@ static unsigned int getOpenCLAlignment(const DataLayout &DL, Type *Ty) { auto *FTy = dyn_cast<FunctionType>(Ty); if (FTy) - return DL.getPointerPrefAlignment(); + return DL.getPointerPrefAlignment().value(); return DL.getPrefTypeAlignment(Ty); } @@ -1473,12 +1473,11 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { // Just print .param .align <a> .b8 .param[size]; // <a> = PAL.getparamalignment // size = typeallocsize of element type - unsigned align = PAL.getParamAlignment(paramIndex); - if (align == 0) - align = DL.getABITypeAlignment(Ty); + const Align align = DL.getValueOrABITypeAlignment( + PAL.getParamAlignment(paramIndex), Ty); unsigned sz = DL.getTypeAllocSize(Ty); - O << "\t.param .align " << align << " .b8 "; + O << "\t.param .align " << align.value() << " .b8 "; printParamName(I, paramIndex, O); O << "[" << sz << "]"; @@ -1559,9 +1558,8 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { // Just print .param .align <a> .b8 .param[size]; // <a> = PAL.getparamalignment // size = typeallocsize of element type - unsigned align = PAL.getParamAlignment(paramIndex); - if (align == 0) - align = DL.getABITypeAlignment(ETy); + Align align = + DL.getValueOrABITypeAlignment(PAL.getParamAlignment(paramIndex), ETy); // Work around a bug in ptxas. When PTX code takes address of // byval parameter with alignment < 4, ptxas generates code to // spill argument into memory. 
Alas on sm_50+ ptxas generates @@ -1573,10 +1571,10 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { // TODO: this will need to be undone when we get to support multi-TU // device-side compilation as it breaks ABI compatibility with nvcc. // Hopefully ptxas bug is fixed by then. - if (!isKernelFunc && align < 4) - align = 4; + if (!isKernelFunc && align < Align(4)) + align = Align(4); unsigned sz = DL.getTypeAllocSize(ETy); - O << "\t.param .align " << align << " .b8 "; + O << "\t.param .align " << align.value() << " .b8 "; printParamName(I, paramIndex, O); O << "[" << sz << "]"; continue; @@ -1653,7 +1651,7 @@ void NVPTXAsmPrinter::setAndEmitFunctionVirtualRegisters( // We use the per class virtual register number in the ptx output. unsigned int numVRs = MRI->getNumVirtRegs(); for (unsigned i = 0; i < numVRs; i++) { - unsigned int vr = TRI->index2VirtReg(i); + unsigned int vr = Register::index2VirtReg(i); const TargetRegisterClass *RC = MRI->getRegClass(vr); DenseMap<unsigned, unsigned> ®map = VRegMapping[RC]; int n = regmap.size(); @@ -1861,7 +1859,7 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes, case Type::HalfTyID: case Type::FloatTyID: case Type::DoubleTyID: { - const ConstantFP *CFP = dyn_cast<ConstantFP>(CPV); + const auto *CFP = cast<ConstantFP>(CPV); Type *Ty = CFP->getType(); if (Ty == Type::getHalfTy(CPV->getContext())) { APInt API = CFP->getValueAPF().bitcastToAPInt(); @@ -2212,7 +2210,7 @@ void NVPTXAsmPrinter::printOperand(const MachineInstr *MI, int opNum, const MachineOperand &MO = MI->getOperand(opNum); switch (MO.getType()) { case MachineOperand::MO_Register: - if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) { + if (Register::isPhysicalRegister(MO.getReg())) { if (MO.getReg() == NVPTX::VRDepot) O << DEPOTNAME << getFunctionNumber(); else diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.h b/lib/Target/NVPTX/NVPTXAsmPrinter.h index 43ae57ac1262..7a66854d32f4 100644 --- a/lib/Target/NVPTX/NVPTXAsmPrinter.h +++ b/lib/Target/NVPTX/NVPTXAsmPrinter.h @@ -200,7 +200,7 @@ private: const Function *F; std::string CurrentFnName; - void EmitBasicBlockStart(const MachineBasicBlock &MBB) const override; + void EmitBasicBlockStart(const MachineBasicBlock &MBB) override; void EmitFunctionEntryLabel() override; void EmitFunctionBodyStart() override; void EmitFunctionBodyEnd() override; diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.cpp b/lib/Target/NVPTX/NVPTXFrameLowering.cpp index 46f08b23d31a..d26912f47e50 100644 --- a/lib/Target/NVPTX/NVPTXFrameLowering.cpp +++ b/lib/Target/NVPTX/NVPTXFrameLowering.cpp @@ -25,7 +25,7 @@ using namespace llvm; NVPTXFrameLowering::NVPTXFrameLowering() - : TargetFrameLowering(TargetFrameLowering::StackGrowsUp, 8, 0) {} + : TargetFrameLowering(TargetFrameLowering::StackGrowsUp, Align(8), 0) {} bool NVPTXFrameLowering::hasFP(const MachineFunction &MF) const { return true; } diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp index ae1aa98da0e8..9acd0bea66fd 100644 --- a/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -480,7 +480,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(ISD::TRAP, MVT::Other, Legal); // Register custom handling for vector loads/stores - for (MVT VT : MVT::vector_valuetypes()) { + for (MVT VT : MVT::fixedlen_vector_valuetypes()) { if (IsPTXVectorType(VT)) { setOperationAction(ISD::LOAD, VT, Custom); setOperationAction(ISD::STORE, VT, Custom); @@ 
-1291,8 +1291,8 @@ std::string NVPTXTargetLowering::getPrototype( O << ".param .b" << size << " _"; } else if (isa<PointerType>(retTy)) { O << ".param .b" << PtrVT.getSizeInBits() << " _"; - } else if (retTy->isAggregateType() || retTy->isVectorTy() || retTy->isIntegerTy(128)) { - auto &DL = CS.getCalledFunction()->getParent()->getDataLayout(); + } else if (retTy->isAggregateType() || retTy->isVectorTy() || + retTy->isIntegerTy(128)) { O << ".param .align " << retAlignment << " .b8 _[" << DL.getTypeAllocSize(retTy) << "]"; } else { @@ -2230,8 +2230,8 @@ SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { if (Op.getValueType() == MVT::v2f16) { LoadSDNode *Load = cast<LoadSDNode>(Op); EVT MemVT = Load->getMemoryVT(); - if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT, - *Load->getMemOperand())) { + if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), + MemVT, *Load->getMemOperand())) { SDValue Ops[2]; std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); return DAG.getMergeValues(Ops, SDLoc(Op)); @@ -2273,8 +2273,8 @@ SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { // v2f16 is legal, so we can't rely on legalizer to handle unaligned // stores and have to handle it here. if (VT == MVT::v2f16 && - !allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, - *Store->getMemOperand())) + !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), + VT, *Store->getMemOperand())) return expandUnalignedStore(Store, DAG); if (VT.isVector()) @@ -3497,7 +3497,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 16; + Info.align = Align(16); return true; } case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col: @@ -3521,7 +3521,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 8; + Info.align = Align(8); return true; } @@ -3547,7 +3547,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 16; + Info.align = Align(16); return true; } @@ -3585,7 +3585,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 4; + Info.align = Align(4); return true; } @@ -3606,7 +3606,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 16; + Info.align = Align(16); return true; } @@ -3627,7 +3627,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 16; + Info.align = Align(16); return true; } @@ -3648,7 +3648,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 16; + Info.align = Align(16); return true; } @@ -3665,7 +3665,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 8; + Info.align = Align(8); return true; } @@ -3686,7 +3686,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOStore; - Info.align = 16; + 
Info.align = Align(16); return true; } @@ -3707,7 +3707,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOStore; - Info.align = 16; + Info.align = Align(16); return true; } @@ -3728,7 +3728,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOStore; - Info.align = 16; + Info.align = Align(16); return true; } @@ -3745,7 +3745,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOStore; - Info.align = 8; + Info.align = Align(8); return true; } @@ -3780,7 +3780,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; - Info.align = 0; + Info.align.reset(); return true; } @@ -3798,7 +3798,8 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue(); + Info.align = + MaybeAlign(cast<ConstantInt>(I.getArgOperand(1))->getZExtValue()); return true; } @@ -3817,7 +3818,8 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue(); + Info.align = + MaybeAlign(cast<ConstantInt>(I.getArgOperand(1))->getZExtValue()); return true; } @@ -3883,7 +3885,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = nullptr; Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 16; + Info.align = Align(16); return true; case Intrinsic::nvvm_tex_1d_v4s32_s32: @@ -4003,7 +4005,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = nullptr; Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 16; + Info.align = Align(16); return true; case Intrinsic::nvvm_suld_1d_i8_clamp: @@ -4056,7 +4058,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = nullptr; Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 16; + Info.align = Align(16); return true; case Intrinsic::nvvm_suld_1d_i16_clamp: @@ -4109,7 +4111,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = nullptr; Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 16; + Info.align = Align(16); return true; case Intrinsic::nvvm_suld_1d_i32_clamp: @@ -4162,7 +4164,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = nullptr; Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 16; + Info.align = Align(16); return true; case Intrinsic::nvvm_suld_1d_i64_clamp: @@ -4200,7 +4202,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = nullptr; Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 16; + Info.align = Align(16); return true; } return false; diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td index 62da3c79f465..fe7a84f9a361 100644 --- a/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -143,12 +143,17 @@ def hasPTX31 : Predicate<"Subtarget->getPTXVersion() >= 31">; def hasPTX60 : Predicate<"Subtarget->getPTXVersion() >= 60">; def hasPTX61 : Predicate<"Subtarget->getPTXVersion() >= 61">; def hasPTX63 : Predicate<"Subtarget->getPTXVersion() >= 63">; +def hasPTX64 : 
Predicate<"Subtarget->getPTXVersion() >= 64">; def hasSM30 : Predicate<"Subtarget->getSmVersion() >= 30">; def hasSM70 : Predicate<"Subtarget->getSmVersion() >= 70">; def hasSM72 : Predicate<"Subtarget->getSmVersion() >= 72">; def hasSM75 : Predicate<"Subtarget->getSmVersion() >= 75">; +// non-sync shfl instructions are not available on sm_70+ in PTX6.4+ +def hasSHFL : Predicate<"!(Subtarget->getSmVersion() >= 70" + "&& Subtarget->getPTXVersion() >= 64)">; + def useShortPtr : Predicate<"useShortPointers()">; def useFP16Math: Predicate<"Subtarget->allowFP16Math()">; @@ -2908,7 +2913,7 @@ def : Pat<(ctlz Int32Regs:$a), (CLZr32 Int32Regs:$a)>; // ctz instruction always returns a 32-bit value. For ctlz.i64, convert the // ptx value to 64 bits to match the ISD node's semantics, unless we know we're // truncating back down to 32 bits. -def : Pat<(ctlz Int64Regs:$a), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>; +def : Pat<(i64 (ctlz Int64Regs:$a)), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>; def : Pat<(i32 (trunc (ctlz Int64Regs:$a))), (CLZr64 Int64Regs:$a)>; // For 16-bit ctlz, we zero-extend to 32-bit, perform the count, then trunc the @@ -2925,10 +2930,10 @@ def : Pat<(i32 (trunc (ctlz Int64Regs:$a))), (CLZr64 Int64Regs:$a)>; // and then ctlz that value. This way we don't have to subtract 16 from the // result. Unfortunately today we don't have a way to generate // "mov b32reg, {b16imm, b16reg}", so we don't do this optimization. -def : Pat<(ctlz Int16Regs:$a), +def : Pat<(i16 (ctlz Int16Regs:$a)), (SUBi16ri (CVT_u16_u32 (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE), 16)>; -def : Pat<(i32 (zext (ctlz Int16Regs:$a))), +def : Pat<(i32 (zext (i16 (ctlz Int16Regs:$a)))), (SUBi32ri (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), 16)>; // Population count @@ -2953,7 +2958,7 @@ def : Pat<(i32 (trunc (ctpop Int64Regs:$a))), (POPCr64 Int64Regs:$a)>; // If we know that we're storing into an i32, we can avoid the final trunc. def : Pat<(ctpop Int16Regs:$a), (CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE)>; -def : Pat<(i32 (zext (ctpop Int16Regs:$a))), +def : Pat<(i32 (zext (i16 (ctpop Int16Regs:$a)))), (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE))>; // fpround f32 -> f16 diff --git a/lib/Target/NVPTX/NVPTXIntrinsics.td b/lib/Target/NVPTX/NVPTXIntrinsics.td index 1752d3e0575e..c52195fb0449 100644 --- a/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -56,6 +56,10 @@ class RegSeq<int n, string prefix> { []); } +class THREADMASK_INFO<bit sync> { + list<bit> ret = !if(sync, [0,1], [0]); +} + //----------------------------------- // Synchronization and shuffle functions //----------------------------------- @@ -129,121 +133,64 @@ def INT_BARRIER_SYNC_CNT_II : NVPTXInst<(outs), (ins i32imm:$id, i32imm:$cnt), [(int_nvvm_barrier_sync_cnt imm:$id, imm:$cnt)]>, Requires<[hasPTX60, hasSM30]>; - -// shfl.{up,down,bfly,idx}.b32 -multiclass SHFL<NVPTXRegClass regclass, string mode, Intrinsic IntOp> { - // The last two parameters to shfl can be regs or imms. ptxas is smart - // enough to inline constant registers, so strictly speaking we don't need to - // handle immediates here. But it's easy enough, and it makes our ptx more - // readable. 
- def reg : NVPTXInst< - (outs regclass:$dst), - (ins regclass:$src, Int32Regs:$offset, Int32Regs:$mask), - !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"), - [(set regclass:$dst, (IntOp regclass:$src, Int32Regs:$offset, Int32Regs:$mask))]>; - - def imm1 : NVPTXInst< - (outs regclass:$dst), - (ins regclass:$src, i32imm:$offset, Int32Regs:$mask), - !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"), - [(set regclass:$dst, (IntOp regclass:$src, imm:$offset, Int32Regs:$mask))]>; - - def imm2 : NVPTXInst< - (outs regclass:$dst), - (ins regclass:$src, Int32Regs:$offset, i32imm:$mask), - !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"), - [(set regclass:$dst, (IntOp regclass:$src, Int32Regs:$offset, imm:$mask))]>; - - def imm3 : NVPTXInst< - (outs regclass:$dst), - (ins regclass:$src, i32imm:$offset, i32imm:$mask), - !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"), - [(set regclass:$dst, (IntOp regclass:$src, imm:$offset, imm:$mask))]>; +class SHFL_INSTR<bit sync, string mode, string reg, bit return_pred, + bit offset_imm, bit mask_imm, bit threadmask_imm> + : NVPTXInst<(outs), (ins), "?", []> { + NVPTXRegClass rc = !cond( + !eq(reg, "i32"): Int32Regs, + !eq(reg, "f32"): Float32Regs); + string IntrName = "int_nvvm_shfl_" + # !if(sync, "sync_", "") + # mode + # "_" # reg + # !if(return_pred, "p", ""); + Intrinsic Intr = !cast<Intrinsic>(IntrName); + let InOperandList = !con( + !if(sync, + !dag(ins, !if(threadmask_imm, [i32imm], [Int32Regs]), ["threadmask"]), + (ins)), + (ins rc:$src), + !dag(ins, !if(offset_imm, [i32imm], [Int32Regs]), ["offset"]), + !dag(ins, !if(mask_imm, [i32imm], [Int32Regs]), ["mask"]) + ); + let OutOperandList = !if(return_pred, (outs rc:$dst, Int1Regs:$pred), (outs rc:$dst)); + let AsmString = "shfl." + # !if(sync, "sync.", "") + # mode # ".b32\t" + # "$dst" + # !if(return_pred, "|$pred", "") # ", " + # "$src, $offset, $mask" + # !if(sync, ", $threadmask", "") + # ";" + ; + let Pattern = [!con( + !foreach(tmp, OutOperandList, + !subst(outs, set, + !subst(i32imm, imm, tmp))), + (set !foreach(tmp, InOperandList, + !subst(ins, Intr, + !subst(i32imm, imm, tmp)))) + )]; } -defm INT_SHFL_DOWN_I32 : SHFL<Int32Regs, "down", int_nvvm_shfl_down_i32>; -defm INT_SHFL_DOWN_F32 : SHFL<Float32Regs, "down", int_nvvm_shfl_down_f32>; -defm INT_SHFL_UP_I32 : SHFL<Int32Regs, "up", int_nvvm_shfl_up_i32>; -defm INT_SHFL_UP_F32 : SHFL<Float32Regs, "up", int_nvvm_shfl_up_f32>; -defm INT_SHFL_BFLY_I32 : SHFL<Int32Regs, "bfly", int_nvvm_shfl_bfly_i32>; -defm INT_SHFL_BFLY_F32 : SHFL<Float32Regs, "bfly", int_nvvm_shfl_bfly_f32>; -defm INT_SHFL_IDX_I32 : SHFL<Int32Regs, "idx", int_nvvm_shfl_idx_i32>; -defm INT_SHFL_IDX_F32 : SHFL<Float32Regs, "idx", int_nvvm_shfl_idx_f32>; - -multiclass SHFL_SYNC<NVPTXRegClass regclass, string mode, Intrinsic IntOp> { - // Threadmask and the last two parameters to shfl.sync can be regs or imms. - // ptxas is smart enough to inline constant registers, so strictly speaking we - // don't need to handle immediates here. But it's easy enough, and it makes - // our ptx more readable. 
- def rrr : NVPTXInst< - (outs regclass:$dst), - (ins Int32Regs:$threadmask, regclass:$src, Int32Regs:$offset, Int32Regs:$mask), - !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"), - [(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src, - Int32Regs:$offset, Int32Regs:$mask))]>; - - def rri : NVPTXInst< - (outs regclass:$dst), - (ins Int32Regs:$threadmask, regclass:$src, Int32Regs:$offset, i32imm:$mask), - !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"), - [(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src, - Int32Regs:$offset, imm:$mask))]>; - - def rir : NVPTXInst< - (outs regclass:$dst), - (ins Int32Regs:$threadmask, regclass:$src, i32imm:$offset, Int32Regs:$mask), - !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"), - [(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src, - imm:$offset, Int32Regs:$mask))]>; - - def rii : NVPTXInst< - (outs regclass:$dst), - (ins Int32Regs:$threadmask, regclass:$src, i32imm:$offset, i32imm:$mask), - !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"), - [(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src, - imm:$offset, imm:$mask))]>; - - def irr : NVPTXInst< - (outs regclass:$dst), - (ins i32imm:$threadmask, regclass:$src, Int32Regs:$offset, Int32Regs:$mask), - !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"), - [(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src, - Int32Regs:$offset, Int32Regs:$mask))]>; - - def iri : NVPTXInst< - (outs regclass:$dst), - (ins i32imm:$threadmask, regclass:$src, Int32Regs:$offset, i32imm:$mask), - !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"), - [(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src, - Int32Regs:$offset, imm:$mask))]>; - - def iir : NVPTXInst< - (outs regclass:$dst), - (ins i32imm:$threadmask, regclass:$src, i32imm:$offset, Int32Regs:$mask), - !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"), - [(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src, - imm:$offset, Int32Regs:$mask))]>; - - def iii : NVPTXInst< - (outs regclass:$dst), - (ins i32imm:$threadmask, regclass:$src, i32imm:$offset, i32imm:$mask), - !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"), - [(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src, - imm:$offset, imm:$mask))]>; +foreach sync = [0, 1] in { + foreach mode = ["up", "down", "bfly", "idx"] in { + foreach regclass = ["i32", "f32"] in { + foreach return_pred = [0, 1] in { + foreach offset_imm = [0, 1] in { + foreach mask_imm = [0, 1] in { + foreach threadmask_imm = THREADMASK_INFO<sync>.ret in { + def : SHFL_INSTR<sync, mode, regclass, return_pred, + offset_imm, mask_imm, threadmask_imm>, + Requires<!if(sync, [hasSM30], [hasSM30, hasSHFL])>; + } + } + } + } + } + } } -// On sm_70 these don't have to be convergent, so we may eventually want to -// implement non-convergent variant of this intrinsic. 
-defm INT_SHFL_SYNC_DOWN_I32 : SHFL_SYNC<Int32Regs, "down", int_nvvm_shfl_sync_down_i32>; -defm INT_SHFL_SYNC_DOWN_F32 : SHFL_SYNC<Float32Regs, "down", int_nvvm_shfl_sync_down_f32>; -defm INT_SHFL_SYNC_UP_I32 : SHFL_SYNC<Int32Regs, "up", int_nvvm_shfl_sync_up_i32>; -defm INT_SHFL_SYNC_UP_F32 : SHFL_SYNC<Float32Regs, "up", int_nvvm_shfl_sync_up_f32>; -defm INT_SHFL_SYNC_BFLY_I32 : SHFL_SYNC<Int32Regs, "bfly", int_nvvm_shfl_sync_bfly_i32>; -defm INT_SHFL_SYNC_BFLY_F32 : SHFL_SYNC<Float32Regs, "bfly", int_nvvm_shfl_sync_bfly_f32>; -defm INT_SHFL_SYNC_IDX_I32 : SHFL_SYNC<Int32Regs, "idx", int_nvvm_shfl_sync_idx_i32>; -defm INT_SHFL_SYNC_IDX_F32 : SHFL_SYNC<Float32Regs, "idx", int_nvvm_shfl_sync_idx_f32>; - - // vote.{all,any,uni,ballot} multiclass VOTE<NVPTXRegClass regclass, string mode, Intrinsic IntOp> { def : NVPTXInst<(outs regclass:$dest), (ins Int1Regs:$pred), diff --git a/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp b/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp index 0743a2986718..83039241a7c7 100644 --- a/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp +++ b/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp @@ -103,7 +103,7 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) { // Do the transformation of an aggr load/copy/set to a loop // for (LoadInst *LI : AggrLoads) { - StoreInst *SI = dyn_cast<StoreInst>(*LI->user_begin()); + auto *SI = cast<StoreInst>(*LI->user_begin()); Value *SrcAddr = LI->getOperand(0); Value *DstAddr = SI->getOperand(1); unsigned NumLoads = DL.getTypeStoreSize(LI->getType()); diff --git a/lib/Target/NVPTX/NVPTXLowerAlloca.cpp b/lib/Target/NVPTX/NVPTXLowerAlloca.cpp index 76fb9f3fa692..945b7286b03c 100644 --- a/lib/Target/NVPTX/NVPTXLowerAlloca.cpp +++ b/lib/Target/NVPTX/NVPTXLowerAlloca.cpp @@ -41,12 +41,12 @@ void initializeNVPTXLowerAllocaPass(PassRegistry &); } namespace { -class NVPTXLowerAlloca : public BasicBlockPass { - bool runOnBasicBlock(BasicBlock &BB) override; +class NVPTXLowerAlloca : public FunctionPass { + bool runOnFunction(Function &F) override; public: static char ID; // Pass identification, replacement for typeid - NVPTXLowerAlloca() : BasicBlockPass(ID) {} + NVPTXLowerAlloca() : FunctionPass(ID) {} StringRef getPassName() const override { return "convert address space of alloca'ed memory to local"; } @@ -61,58 +61,61 @@ INITIALIZE_PASS(NVPTXLowerAlloca, "nvptx-lower-alloca", // ============================================================================= // Main function for this pass. 
// ============================================================================= -bool NVPTXLowerAlloca::runOnBasicBlock(BasicBlock &BB) { - if (skipBasicBlock(BB)) +bool NVPTXLowerAlloca::runOnFunction(Function &F) { + if (skipFunction(F)) return false; bool Changed = false; - for (auto &I : BB) { - if (auto allocaInst = dyn_cast<AllocaInst>(&I)) { - Changed = true; - auto PTy = dyn_cast<PointerType>(allocaInst->getType()); - auto ETy = PTy->getElementType(); - auto LocalAddrTy = PointerType::get(ETy, ADDRESS_SPACE_LOCAL); - auto NewASCToLocal = new AddrSpaceCastInst(allocaInst, LocalAddrTy, ""); - auto GenericAddrTy = PointerType::get(ETy, ADDRESS_SPACE_GENERIC); - auto NewASCToGeneric = new AddrSpaceCastInst(NewASCToLocal, - GenericAddrTy, ""); - NewASCToLocal->insertAfter(allocaInst); - NewASCToGeneric->insertAfter(NewASCToLocal); - for (Value::use_iterator UI = allocaInst->use_begin(), - UE = allocaInst->use_end(); - UI != UE; ) { - // Check Load, Store, GEP, and BitCast Uses on alloca and make them - // use the converted generic address, in order to expose non-generic - // addrspacecast to NVPTXInferAddressSpaces. For other types - // of instructions this is unnecessary and may introduce redundant - // address cast. - const auto &AllocaUse = *UI++; - auto LI = dyn_cast<LoadInst>(AllocaUse.getUser()); - if (LI && LI->getPointerOperand() == allocaInst && !LI->isVolatile()) { - LI->setOperand(LI->getPointerOperandIndex(), NewASCToGeneric); - continue; - } - auto SI = dyn_cast<StoreInst>(AllocaUse.getUser()); - if (SI && SI->getPointerOperand() == allocaInst && !SI->isVolatile()) { - SI->setOperand(SI->getPointerOperandIndex(), NewASCToGeneric); - continue; - } - auto GI = dyn_cast<GetElementPtrInst>(AllocaUse.getUser()); - if (GI && GI->getPointerOperand() == allocaInst) { - GI->setOperand(GI->getPointerOperandIndex(), NewASCToGeneric); - continue; - } - auto BI = dyn_cast<BitCastInst>(AllocaUse.getUser()); - if (BI && BI->getOperand(0) == allocaInst) { - BI->setOperand(0, NewASCToGeneric); - continue; + for (auto &BB : F) + for (auto &I : BB) { + if (auto allocaInst = dyn_cast<AllocaInst>(&I)) { + Changed = true; + auto PTy = dyn_cast<PointerType>(allocaInst->getType()); + auto ETy = PTy->getElementType(); + auto LocalAddrTy = PointerType::get(ETy, ADDRESS_SPACE_LOCAL); + auto NewASCToLocal = new AddrSpaceCastInst(allocaInst, LocalAddrTy, ""); + auto GenericAddrTy = PointerType::get(ETy, ADDRESS_SPACE_GENERIC); + auto NewASCToGeneric = + new AddrSpaceCastInst(NewASCToLocal, GenericAddrTy, ""); + NewASCToLocal->insertAfter(allocaInst); + NewASCToGeneric->insertAfter(NewASCToLocal); + for (Value::use_iterator UI = allocaInst->use_begin(), + UE = allocaInst->use_end(); + UI != UE;) { + // Check Load, Store, GEP, and BitCast Uses on alloca and make them + // use the converted generic address, in order to expose non-generic + // addrspacecast to NVPTXInferAddressSpaces. For other types + // of instructions this is unnecessary and may introduce redundant + // address cast. 
+ const auto &AllocaUse = *UI++; + auto LI = dyn_cast<LoadInst>(AllocaUse.getUser()); + if (LI && LI->getPointerOperand() == allocaInst && + !LI->isVolatile()) { + LI->setOperand(LI->getPointerOperandIndex(), NewASCToGeneric); + continue; + } + auto SI = dyn_cast<StoreInst>(AllocaUse.getUser()); + if (SI && SI->getPointerOperand() == allocaInst && + !SI->isVolatile()) { + SI->setOperand(SI->getPointerOperandIndex(), NewASCToGeneric); + continue; + } + auto GI = dyn_cast<GetElementPtrInst>(AllocaUse.getUser()); + if (GI && GI->getPointerOperand() == allocaInst) { + GI->setOperand(GI->getPointerOperandIndex(), NewASCToGeneric); + continue; + } + auto BI = dyn_cast<BitCastInst>(AllocaUse.getUser()); + if (BI && BI->getOperand(0) == allocaInst) { + BI->setOperand(0, NewASCToGeneric); + continue; + } } } } - } return Changed; } -BasicBlockPass *llvm::createNVPTXLowerAllocaPass() { +FunctionPass *llvm::createNVPTXLowerAllocaPass() { return new NVPTXLowerAlloca(); } diff --git a/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/lib/Target/NVPTX/NVPTXLowerArgs.cpp index c5e02e34e25e..c3c5f6fbcba7 100644 --- a/lib/Target/NVPTX/NVPTXLowerArgs.cpp +++ b/lib/Target/NVPTX/NVPTXLowerArgs.cpp @@ -164,7 +164,7 @@ void NVPTXLowerArgs::handleByValParam(Argument *Arg) { // Set the alignment to alignment of the byval parameter. This is because, // later load/stores assume that alignment, and we are going to replace // the use of the byval parameter with this alloca instruction. - AllocA->setAlignment(Func->getParamAlignment(Arg->getArgNo())); + AllocA->setAlignment(MaybeAlign(Func->getParamAlignment(Arg->getArgNo()))); Arg->replaceAllUsesWith(AllocA); Value *ArgInParam = new AddrSpaceCastInst( diff --git a/lib/Target/NVPTX/NVPTXPeephole.cpp b/lib/Target/NVPTX/NVPTXPeephole.cpp index 629757db8707..5e6411c61eab 100644 --- a/lib/Target/NVPTX/NVPTXPeephole.cpp +++ b/lib/Target/NVPTX/NVPTXPeephole.cpp @@ -81,7 +81,7 @@ static bool isCVTAToLocalCombinationCandidate(MachineInstr &Root) { auto &Op = Root.getOperand(1); const auto &MRI = MF.getRegInfo(); MachineInstr *GenericAddrDef = nullptr; - if (Op.isReg() && TargetRegisterInfo::isVirtualRegister(Op.getReg())) { + if (Op.isReg() && Register::isVirtualRegister(Op.getReg())) { GenericAddrDef = MRI.getUniqueVRegDef(Op.getReg()); } diff --git a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp index 4c5a9adf1f65..a7127b0e9a99 100644 --- a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp +++ b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp @@ -178,7 +178,7 @@ NVPTXPrologEpilogPass::calculateFrameObjectOffsets(MachineFunction &Fn) { // frame index registers. Functions which don't want/need this optimization // will continue to use the existing code path. if (MFI.getUseLocalStackAllocationBlock()) { - unsigned Align = MFI.getLocalFrameMaxAlign(); + unsigned Align = MFI.getLocalFrameMaxAlign().value(); // Adjust to alignment boundary. 
     Offset = (Offset + Align - 1) / Align * Align;
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 11b3fe2fa3d3..f58fb5717773 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -116,7 +116,7 @@ NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT,
                         CPU, FS, Options, Reloc::PIC_,
                         getEffectiveCodeModel(CM, CodeModel::Small), OL),
       is64bit(is64bit), UseShortPointers(UseShortPointersOpt),
-      TLOF(llvm::make_unique<NVPTXTargetObjectFile>()),
+      TLOF(std::make_unique<NVPTXTargetObjectFile>()),
       Subtarget(TT, CPU, FS, *this) {
   if (TT.getOS() == Triple::NVCL)
     drvInterface = NVPTX::NVCL;
diff --git a/lib/Target/NVPTX/NVPTXUtilities.cpp b/lib/Target/NVPTX/NVPTXUtilities.cpp
index 665eb1383253..43c2e9920403 100644
--- a/lib/Target/NVPTX/NVPTXUtilities.cpp
+++ b/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -19,10 +19,11 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/Support/ManagedStatic.h"
-#include "llvm/Support/MutexGuard.h"
+#include "llvm/Support/Mutex.h"
 #include <algorithm>
 #include <cstring>
 #include <map>
+#include <mutex>
 #include <string>
 #include <vector>
@@ -38,12 +39,12 @@ static ManagedStatic<per_module_annot_t> annotationCache;
 static sys::Mutex Lock;

 void clearAnnotationCache(const Module *Mod) {
-  MutexGuard Guard(Lock);
+  std::lock_guard<sys::Mutex> Guard(Lock);
   annotationCache->erase(Mod);
 }

 static void cacheAnnotationFromMD(const MDNode *md, key_val_pair_t &retval) {
-  MutexGuard Guard(Lock);
+  std::lock_guard<sys::Mutex> Guard(Lock);
   assert(md && "Invalid mdnode for annotation");
   assert((md->getNumOperands() % 2) == 1 && "Invalid number of operands");
   // start index = 1, to skip the global variable key
@@ -69,7 +70,7 @@ static void cacheAnnotationFromMD(const MDNode *md, key_val_pair_t &retval) {
 }

 static void cacheAnnotationFromMD(const Module *m, const GlobalValue *gv) {
-  MutexGuard Guard(Lock);
+  std::lock_guard<sys::Mutex> Guard(Lock);
   NamedMDNode *NMD = m->getNamedMetadata("nvvm.annotations");
   if (!NMD)
     return;
@@ -103,7 +104,7 @@ static void cacheAnnotationFromMD(const Module *m, const GlobalValue *gv) {

 bool findOneNVVMAnnotation(const GlobalValue *gv, const std::string &prop,
                            unsigned &retval) {
-  MutexGuard Guard(Lock);
+  std::lock_guard<sys::Mutex> Guard(Lock);
   const Module *m = gv->getParent();
   if ((*annotationCache).find(m) == (*annotationCache).end())
     cacheAnnotationFromMD(m, gv);
@@ -117,7 +118,7 @@ bool findOneNVVMAnnotation(const GlobalValue *gv, const std::string &prop,

 bool findAllNVVMAnnotation(const GlobalValue *gv, const std::string &prop,
                            std::vector<unsigned> &retval) {
-  MutexGuard Guard(Lock);
+  std::lock_guard<sys::Mutex> Guard(Lock);
   const Module *m = gv->getParent();
   if ((*annotationCache).find(m) == (*annotationCache).end())
     cacheAnnotationFromMD(m, gv);
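For context on the shuffle changes above: with the new `hasSHFL` predicate, the non-sync `llvm.nvvm.shfl.*` intrinsics have no selectable pattern on sm_70+ when targeting PTX 6.4+, so IR producers have to emit the `.sync` forms instead. A minimal sketch of emitting one such call with IRBuilder, assuming the LLVM-9-era `Intrinsic::getDeclaration` API and the operand order shown in the patterns above (membermask, value, offset, mask-and-clamp); the helper name `emitShflDownSync` is hypothetical:

```cpp
// Illustrative sketch only -- not code from this patch.
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"

// Emits: %r = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %val, i32 %off, i32 31)
llvm::Value *emitShflDownSync(llvm::IRBuilder<> &B, llvm::Module &M,
                              llvm::Value *Val, llvm::Value *Offset) {
  using namespace llvm;
  Function *Shfl =
      Intrinsic::getDeclaration(&M, Intrinsic::nvvm_shfl_sync_down_i32);
  Value *MemberMask = B.getInt32(0xffffffff); // all lanes of the warp participate
  Value *MaskAndClamp = B.getInt32(0x1f);     // clamp encoding for a full-width (32-lane) shuffle
  return B.CreateCall(Shfl, {MemberMask, Val, Offset, MaskAndClamp});
}
```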