Diffstat (limited to 'lib/Target/NVPTX')
 lib/Target/NVPTX/NVPTX.h                   |   2
 lib/Target/NVPTX/NVPTXAsmPrinter.cpp       |  34
 lib/Target/NVPTX/NVPTXAsmPrinter.h         |   2
 lib/Target/NVPTX/NVPTXFrameLowering.cpp    |   2
 lib/Target/NVPTX/NVPTXISelLowering.cpp     |  58
 lib/Target/NVPTX/NVPTXInstrInfo.td         |  13
 lib/Target/NVPTX/NVPTXIntrinsics.td        | 169
 lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp  |   2
 lib/Target/NVPTX/NVPTXLowerAlloca.cpp      |  97
 lib/Target/NVPTX/NVPTXLowerArgs.cpp        |   2
 lib/Target/NVPTX/NVPTXPeephole.cpp         |   2
 lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp |   2
 lib/Target/NVPTX/NVPTXTargetMachine.cpp    |   2
 lib/Target/NVPTX/NVPTXUtilities.cpp        |  13
14 files changed, 178 insertions, 222 deletions
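A large share of the NVPTXAsmPrinter.cpp and NVPTXISelLowering.cpp churn in the diff below is the migration from raw `unsigned` alignments to the typed `llvm::Align`/`llvm::MaybeAlign` wrappers. As orientation only, here is a minimal sketch of that API, assuming the LLVM-9-era `llvm/Support/Alignment.h` interface; the helper name `pickParamAlign` is hypothetical and not part of the patch:

```cpp
// Illustrative sketch only -- not code from this patch.
#include "llvm/Support/Alignment.h"

// MaybeAlign models "alignment possibly unspecified" (e.g. a parameter with no
// explicit align attribute); Align is always a valid, known power of two.
unsigned pickParamAlign(llvm::MaybeAlign ParamAlign, llvm::Align ABIAlign) {
  llvm::Align A = ParamAlign ? *ParamAlign : ABIAlign; // fall back to the ABI alignment
  if (A < llvm::Align(4))
    A = llvm::Align(4);  // same kind of clamp the patch applies to byval params
  return A.value();      // Align::value() returns the alignment in bytes
}
```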
diff --git a/lib/Target/NVPTX/NVPTX.h b/lib/Target/NVPTX/NVPTX.h index 6530c40ea100..0acbace5f848 100644 --- a/lib/Target/NVPTX/NVPTX.h +++ b/lib/Target/NVPTX/NVPTX.h @@ -44,7 +44,7 @@ MachineFunctionPass *createNVPTXPrologEpilogPass(); MachineFunctionPass *createNVPTXReplaceImageHandlesPass(); FunctionPass *createNVPTXImageOptimizerPass(); FunctionPass *createNVPTXLowerArgsPass(const NVPTXTargetMachine *TM); -BasicBlockPass *createNVPTXLowerAllocaPass(); +FunctionPass *createNVPTXLowerAllocaPass(); MachineFunctionPass *createNVPTXPeephole(); MachineFunctionPass *createNVPTXProxyRegErasurePass(); diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index 5f38b4a3c4c5..307f4d58c3ab 100644 --- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -282,7 +282,7 @@ bool NVPTXAsmPrinter::lowerOperand(const MachineOperand &MO, } unsigned NVPTXAsmPrinter::encodeVirtualRegister(unsigned Reg) { - if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (Register::isVirtualRegister(Reg)) { const TargetRegisterClass *RC = MRI->getRegClass(Reg); DenseMap<unsigned, unsigned> &RegMap = VRegMapping[RC]; @@ -434,7 +434,7 @@ bool NVPTXAsmPrinter::isLoopHeaderOfNoUnroll( return false; } -void NVPTXAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const { +void NVPTXAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) { AsmPrinter::EmitBasicBlockStart(MBB); if (isLoopHeaderOfNoUnroll(MBB)) OutStreamer->EmitRawText(StringRef("\t.pragma \"nounroll\";\n")); @@ -507,8 +507,8 @@ const MCSymbol *NVPTXAsmPrinter::getFunctionFrameSymbol() const { } void NVPTXAsmPrinter::emitImplicitDef(const MachineInstr *MI) const { - unsigned RegNo = MI->getOperand(0).getReg(); - if (TargetRegisterInfo::isVirtualRegister(RegNo)) { + Register RegNo = MI->getOperand(0).getReg(); + if (Register::isVirtualRegister(RegNo)) { OutStreamer->AddComment(Twine("implicit-def: ") + getVirtualRegisterName(RegNo)); } else { @@ -1397,7 +1397,7 @@ static unsigned int getOpenCLAlignment(const DataLayout &DL, Type *Ty) { auto *FTy = dyn_cast<FunctionType>(Ty); if (FTy) - return DL.getPointerPrefAlignment(); + return DL.getPointerPrefAlignment().value(); return DL.getPrefTypeAlignment(Ty); } @@ -1473,12 +1473,11 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { // Just print .param .align <a> .b8 .param[size]; // <a> = PAL.getparamalignment // size = typeallocsize of element type - unsigned align = PAL.getParamAlignment(paramIndex); - if (align == 0) - align = DL.getABITypeAlignment(Ty); + const Align align = DL.getValueOrABITypeAlignment( + PAL.getParamAlignment(paramIndex), Ty); unsigned sz = DL.getTypeAllocSize(Ty); - O << "\t.param .align " << align << " .b8 "; + O << "\t.param .align " << align.value() << " .b8 "; printParamName(I, paramIndex, O); O << "[" << sz << "]"; @@ -1559,9 +1558,8 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { // Just print .param .align <a> .b8 .param[size]; // <a> = PAL.getparamalignment // size = typeallocsize of element type - unsigned align = PAL.getParamAlignment(paramIndex); - if (align == 0) - align = DL.getABITypeAlignment(ETy); + Align align = + DL.getValueOrABITypeAlignment(PAL.getParamAlignment(paramIndex), ETy); // Work around a bug in ptxas. When PTX code takes address of // byval parameter with alignment < 4, ptxas generates code to // spill argument into memory. 
Alas on sm_50+ ptxas generates @@ -1573,10 +1571,10 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { // TODO: this will need to be undone when we get to support multi-TU // device-side compilation as it breaks ABI compatibility with nvcc. // Hopefully ptxas bug is fixed by then. - if (!isKernelFunc && align < 4) - align = 4; + if (!isKernelFunc && align < Align(4)) + align = Align(4); unsigned sz = DL.getTypeAllocSize(ETy); - O << "\t.param .align " << align << " .b8 "; + O << "\t.param .align " << align.value() << " .b8 "; printParamName(I, paramIndex, O); O << "[" << sz << "]"; continue; @@ -1653,7 +1651,7 @@ void NVPTXAsmPrinter::setAndEmitFunctionVirtualRegisters( // We use the per class virtual register number in the ptx output. unsigned int numVRs = MRI->getNumVirtRegs(); for (unsigned i = 0; i < numVRs; i++) { - unsigned int vr = TRI->index2VirtReg(i); + unsigned int vr = Register::index2VirtReg(i); const TargetRegisterClass *RC = MRI->getRegClass(vr); DenseMap<unsigned, unsigned> ®map = VRegMapping[RC]; int n = regmap.size(); @@ -1861,7 +1859,7 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes, case Type::HalfTyID: case Type::FloatTyID: case Type::DoubleTyID: { - const ConstantFP *CFP = dyn_cast<ConstantFP>(CPV); + const auto *CFP = cast<ConstantFP>(CPV); Type *Ty = CFP->getType(); if (Ty == Type::getHalfTy(CPV->getContext())) { APInt API = CFP->getValueAPF().bitcastToAPInt(); @@ -2212,7 +2210,7 @@ void NVPTXAsmPrinter::printOperand(const MachineInstr *MI, int opNum, const MachineOperand &MO = MI->getOperand(opNum); switch (MO.getType()) { case MachineOperand::MO_Register: - if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) { + if (Register::isPhysicalRegister(MO.getReg())) { if (MO.getReg() == NVPTX::VRDepot) O << DEPOTNAME << getFunctionNumber(); else diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.h b/lib/Target/NVPTX/NVPTXAsmPrinter.h index 43ae57ac1262..7a66854d32f4 100644 --- a/lib/Target/NVPTX/NVPTXAsmPrinter.h +++ b/lib/Target/NVPTX/NVPTXAsmPrinter.h @@ -200,7 +200,7 @@ private: const Function *F; std::string CurrentFnName; - void EmitBasicBlockStart(const MachineBasicBlock &MBB) const override; + void EmitBasicBlockStart(const MachineBasicBlock &MBB) override; void EmitFunctionEntryLabel() override; void EmitFunctionBodyStart() override; void EmitFunctionBodyEnd() override; diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.cpp b/lib/Target/NVPTX/NVPTXFrameLowering.cpp index 46f08b23d31a..d26912f47e50 100644 --- a/lib/Target/NVPTX/NVPTXFrameLowering.cpp +++ b/lib/Target/NVPTX/NVPTXFrameLowering.cpp @@ -25,7 +25,7 @@ using namespace llvm; NVPTXFrameLowering::NVPTXFrameLowering() - : TargetFrameLowering(TargetFrameLowering::StackGrowsUp, 8, 0) {} + : TargetFrameLowering(TargetFrameLowering::StackGrowsUp, Align(8), 0) {} bool NVPTXFrameLowering::hasFP(const MachineFunction &MF) const { return true; } diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp index ae1aa98da0e8..9acd0bea66fd 100644 --- a/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -480,7 +480,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM, setOperationAction(ISD::TRAP, MVT::Other, Legal); // Register custom handling for vector loads/stores - for (MVT VT : MVT::vector_valuetypes()) { + for (MVT VT : MVT::fixedlen_vector_valuetypes()) { if (IsPTXVectorType(VT)) { setOperationAction(ISD::LOAD, VT, Custom); setOperationAction(ISD::STORE, VT, Custom); @@ 
-1291,8 +1291,8 @@ std::string NVPTXTargetLowering::getPrototype( O << ".param .b" << size << " _"; } else if (isa<PointerType>(retTy)) { O << ".param .b" << PtrVT.getSizeInBits() << " _"; - } else if (retTy->isAggregateType() || retTy->isVectorTy() || retTy->isIntegerTy(128)) { - auto &DL = CS.getCalledFunction()->getParent()->getDataLayout(); + } else if (retTy->isAggregateType() || retTy->isVectorTy() || + retTy->isIntegerTy(128)) { O << ".param .align " << retAlignment << " .b8 _[" << DL.getTypeAllocSize(retTy) << "]"; } else { @@ -2230,8 +2230,8 @@ SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { if (Op.getValueType() == MVT::v2f16) { LoadSDNode *Load = cast<LoadSDNode>(Op); EVT MemVT = Load->getMemoryVT(); - if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT, - *Load->getMemOperand())) { + if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), + MemVT, *Load->getMemOperand())) { SDValue Ops[2]; std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); return DAG.getMergeValues(Ops, SDLoc(Op)); @@ -2273,8 +2273,8 @@ SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { // v2f16 is legal, so we can't rely on legalizer to handle unaligned // stores and have to handle it here. if (VT == MVT::v2f16 && - !allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, - *Store->getMemOperand())) + !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), + VT, *Store->getMemOperand())) return expandUnalignedStore(Store, DAG); if (VT.isVector()) @@ -3497,7 +3497,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 16; + Info.align = Align(16); return true; } case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col: @@ -3521,7 +3521,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 8; + Info.align = Align(8); return true; } @@ -3547,7 +3547,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 16; + Info.align = Align(16); return true; } @@ -3585,7 +3585,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 4; + Info.align = Align(4); return true; } @@ -3606,7 +3606,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 16; + Info.align = Align(16); return true; } @@ -3627,7 +3627,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 16; + Info.align = Align(16); return true; } @@ -3648,7 +3648,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 16; + Info.align = Align(16); return true; } @@ -3665,7 +3665,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 8; + Info.align = Align(8); return true; } @@ -3686,7 +3686,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOStore; - Info.align = 16; + 
Info.align = Align(16); return true; } @@ -3707,7 +3707,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOStore; - Info.align = 16; + Info.align = Align(16); return true; } @@ -3728,7 +3728,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOStore; - Info.align = 16; + Info.align = Align(16); return true; } @@ -3745,7 +3745,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOStore; - Info.align = 8; + Info.align = Align(8); return true; } @@ -3780,7 +3780,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; - Info.align = 0; + Info.align.reset(); return true; } @@ -3798,7 +3798,8 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue(); + Info.align = + MaybeAlign(cast<ConstantInt>(I.getArgOperand(1))->getZExtValue()); return true; } @@ -3817,7 +3818,8 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue(); + Info.align = + MaybeAlign(cast<ConstantInt>(I.getArgOperand(1))->getZExtValue()); return true; } @@ -3883,7 +3885,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = nullptr; Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 16; + Info.align = Align(16); return true; case Intrinsic::nvvm_tex_1d_v4s32_s32: @@ -4003,7 +4005,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = nullptr; Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 16; + Info.align = Align(16); return true; case Intrinsic::nvvm_suld_1d_i8_clamp: @@ -4056,7 +4058,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = nullptr; Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 16; + Info.align = Align(16); return true; case Intrinsic::nvvm_suld_1d_i16_clamp: @@ -4109,7 +4111,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = nullptr; Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 16; + Info.align = Align(16); return true; case Intrinsic::nvvm_suld_1d_i32_clamp: @@ -4162,7 +4164,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = nullptr; Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 16; + Info.align = Align(16); return true; case Intrinsic::nvvm_suld_1d_i64_clamp: @@ -4200,7 +4202,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.ptrVal = nullptr; Info.offset = 0; Info.flags = MachineMemOperand::MOLoad; - Info.align = 16; + Info.align = Align(16); return true; } return false; diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td index 62da3c79f465..fe7a84f9a361 100644 --- a/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -143,12 +143,17 @@ def hasPTX31 : Predicate<"Subtarget->getPTXVersion() >= 31">; def hasPTX60 : Predicate<"Subtarget->getPTXVersion() >= 60">; def hasPTX61 : Predicate<"Subtarget->getPTXVersion() >= 61">; def hasPTX63 : Predicate<"Subtarget->getPTXVersion() >= 63">; +def hasPTX64 : 
Predicate<"Subtarget->getPTXVersion() >= 64">; def hasSM30 : Predicate<"Subtarget->getSmVersion() >= 30">; def hasSM70 : Predicate<"Subtarget->getSmVersion() >= 70">; def hasSM72 : Predicate<"Subtarget->getSmVersion() >= 72">; def hasSM75 : Predicate<"Subtarget->getSmVersion() >= 75">; +// non-sync shfl instructions are not available on sm_70+ in PTX6.4+ +def hasSHFL : Predicate<"!(Subtarget->getSmVersion() >= 70" + "&& Subtarget->getPTXVersion() >= 64)">; + def useShortPtr : Predicate<"useShortPointers()">; def useFP16Math: Predicate<"Subtarget->allowFP16Math()">; @@ -2908,7 +2913,7 @@ def : Pat<(ctlz Int32Regs:$a), (CLZr32 Int32Regs:$a)>; // ctz instruction always returns a 32-bit value. For ctlz.i64, convert the // ptx value to 64 bits to match the ISD node's semantics, unless we know we're // truncating back down to 32 bits. -def : Pat<(ctlz Int64Regs:$a), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>; +def : Pat<(i64 (ctlz Int64Regs:$a)), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>; def : Pat<(i32 (trunc (ctlz Int64Regs:$a))), (CLZr64 Int64Regs:$a)>; // For 16-bit ctlz, we zero-extend to 32-bit, perform the count, then trunc the @@ -2925,10 +2930,10 @@ def : Pat<(i32 (trunc (ctlz Int64Regs:$a))), (CLZr64 Int64Regs:$a)>; // and then ctlz that value. This way we don't have to subtract 16 from the // result. Unfortunately today we don't have a way to generate // "mov b32reg, {b16imm, b16reg}", so we don't do this optimization. -def : Pat<(ctlz Int16Regs:$a), +def : Pat<(i16 (ctlz Int16Regs:$a)), (SUBi16ri (CVT_u16_u32 (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE), 16)>; -def : Pat<(i32 (zext (ctlz Int16Regs:$a))), +def : Pat<(i32 (zext (i16 (ctlz Int16Regs:$a)))), (SUBi32ri (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), 16)>; // Population count @@ -2953,7 +2958,7 @@ def : Pat<(i32 (trunc (ctpop Int64Regs:$a))), (POPCr64 Int64Regs:$a)>; // If we know that we're storing into an i32, we can avoid the final trunc. def : Pat<(ctpop Int16Regs:$a), (CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE)>; -def : Pat<(i32 (zext (ctpop Int16Regs:$a))), +def : Pat<(i32 (zext (i16 (ctpop Int16Regs:$a)))), (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE))>; // fpround f32 -> f16 diff --git a/lib/Target/NVPTX/NVPTXIntrinsics.td b/lib/Target/NVPTX/NVPTXIntrinsics.td index 1752d3e0575e..c52195fb0449 100644 --- a/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -56,6 +56,10 @@ class RegSeq<int n, string prefix> { []); } +class THREADMASK_INFO<bit sync> { + list<bit> ret = !if(sync, [0,1], [0]); +} + //----------------------------------- // Synchronization and shuffle functions //----------------------------------- @@ -129,121 +133,64 @@ def INT_BARRIER_SYNC_CNT_II : NVPTXInst<(outs), (ins i32imm:$id, i32imm:$cnt), [(int_nvvm_barrier_sync_cnt imm:$id, imm:$cnt)]>, Requires<[hasPTX60, hasSM30]>; - -// shfl.{up,down,bfly,idx}.b32 -multiclass SHFL<NVPTXRegClass regclass, string mode, Intrinsic IntOp> { - // The last two parameters to shfl can be regs or imms. ptxas is smart - // enough to inline constant registers, so strictly speaking we don't need to - // handle immediates here. But it's easy enough, and it makes our ptx more - // readable. 
- def reg : NVPTXInst< - (outs regclass:$dst), - (ins regclass:$src, Int32Regs:$offset, Int32Regs:$mask), - !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"), - [(set regclass:$dst, (IntOp regclass:$src, Int32Regs:$offset, Int32Regs:$mask))]>; - - def imm1 : NVPTXInst< - (outs regclass:$dst), - (ins regclass:$src, i32imm:$offset, Int32Regs:$mask), - !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"), - [(set regclass:$dst, (IntOp regclass:$src, imm:$offset, Int32Regs:$mask))]>; - - def imm2 : NVPTXInst< - (outs regclass:$dst), - (ins regclass:$src, Int32Regs:$offset, i32imm:$mask), - !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"), - [(set regclass:$dst, (IntOp regclass:$src, Int32Regs:$offset, imm:$mask))]>; - - def imm3 : NVPTXInst< - (outs regclass:$dst), - (ins regclass:$src, i32imm:$offset, i32imm:$mask), - !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"), - [(set regclass:$dst, (IntOp regclass:$src, imm:$offset, imm:$mask))]>; +class SHFL_INSTR<bit sync, string mode, string reg, bit return_pred, + bit offset_imm, bit mask_imm, bit threadmask_imm> + : NVPTXInst<(outs), (ins), "?", []> { + NVPTXRegClass rc = !cond( + !eq(reg, "i32"): Int32Regs, + !eq(reg, "f32"): Float32Regs); + string IntrName = "int_nvvm_shfl_" + # !if(sync, "sync_", "") + # mode + # "_" # reg + # !if(return_pred, "p", ""); + Intrinsic Intr = !cast<Intrinsic>(IntrName); + let InOperandList = !con( + !if(sync, + !dag(ins, !if(threadmask_imm, [i32imm], [Int32Regs]), ["threadmask"]), + (ins)), + (ins rc:$src), + !dag(ins, !if(offset_imm, [i32imm], [Int32Regs]), ["offset"]), + !dag(ins, !if(mask_imm, [i32imm], [Int32Regs]), ["mask"]) + ); + let OutOperandList = !if(return_pred, (outs rc:$dst, Int1Regs:$pred), (outs rc:$dst)); + let AsmString = "shfl." + # !if(sync, "sync.", "") + # mode # ".b32\t" + # "$dst" + # !if(return_pred, "|$pred", "") # ", " + # "$src, $offset, $mask" + # !if(sync, ", $threadmask", "") + # ";" + ; + let Pattern = [!con( + !foreach(tmp, OutOperandList, + !subst(outs, set, + !subst(i32imm, imm, tmp))), + (set !foreach(tmp, InOperandList, + !subst(ins, Intr, + !subst(i32imm, imm, tmp)))) + )]; } -defm INT_SHFL_DOWN_I32 : SHFL<Int32Regs, "down", int_nvvm_shfl_down_i32>; -defm INT_SHFL_DOWN_F32 : SHFL<Float32Regs, "down", int_nvvm_shfl_down_f32>; -defm INT_SHFL_UP_I32 : SHFL<Int32Regs, "up", int_nvvm_shfl_up_i32>; -defm INT_SHFL_UP_F32 : SHFL<Float32Regs, "up", int_nvvm_shfl_up_f32>; -defm INT_SHFL_BFLY_I32 : SHFL<Int32Regs, "bfly", int_nvvm_shfl_bfly_i32>; -defm INT_SHFL_BFLY_F32 : SHFL<Float32Regs, "bfly", int_nvvm_shfl_bfly_f32>; -defm INT_SHFL_IDX_I32 : SHFL<Int32Regs, "idx", int_nvvm_shfl_idx_i32>; -defm INT_SHFL_IDX_F32 : SHFL<Float32Regs, "idx", int_nvvm_shfl_idx_f32>; - -multiclass SHFL_SYNC<NVPTXRegClass regclass, string mode, Intrinsic IntOp> { - // Threadmask and the last two parameters to shfl.sync can be regs or imms. - // ptxas is smart enough to inline constant registers, so strictly speaking we - // don't need to handle immediates here. But it's easy enough, and it makes - // our ptx more readable. 
- def rrr : NVPTXInst< - (outs regclass:$dst), - (ins Int32Regs:$threadmask, regclass:$src, Int32Regs:$offset, Int32Regs:$mask), - !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"), - [(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src, - Int32Regs:$offset, Int32Regs:$mask))]>; - - def rri : NVPTXInst< - (outs regclass:$dst), - (ins Int32Regs:$threadmask, regclass:$src, Int32Regs:$offset, i32imm:$mask), - !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"), - [(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src, - Int32Regs:$offset, imm:$mask))]>; - - def rir : NVPTXInst< - (outs regclass:$dst), - (ins Int32Regs:$threadmask, regclass:$src, i32imm:$offset, Int32Regs:$mask), - !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"), - [(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src, - imm:$offset, Int32Regs:$mask))]>; - - def rii : NVPTXInst< - (outs regclass:$dst), - (ins Int32Regs:$threadmask, regclass:$src, i32imm:$offset, i32imm:$mask), - !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"), - [(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src, - imm:$offset, imm:$mask))]>; - - def irr : NVPTXInst< - (outs regclass:$dst), - (ins i32imm:$threadmask, regclass:$src, Int32Regs:$offset, Int32Regs:$mask), - !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"), - [(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src, - Int32Regs:$offset, Int32Regs:$mask))]>; - - def iri : NVPTXInst< - (outs regclass:$dst), - (ins i32imm:$threadmask, regclass:$src, Int32Regs:$offset, i32imm:$mask), - !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"), - [(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src, - Int32Regs:$offset, imm:$mask))]>; - - def iir : NVPTXInst< - (outs regclass:$dst), - (ins i32imm:$threadmask, regclass:$src, i32imm:$offset, Int32Regs:$mask), - !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"), - [(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src, - imm:$offset, Int32Regs:$mask))]>; - - def iii : NVPTXInst< - (outs regclass:$dst), - (ins i32imm:$threadmask, regclass:$src, i32imm:$offset, i32imm:$mask), - !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"), - [(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src, - imm:$offset, imm:$mask))]>; +foreach sync = [0, 1] in { + foreach mode = ["up", "down", "bfly", "idx"] in { + foreach regclass = ["i32", "f32"] in { + foreach return_pred = [0, 1] in { + foreach offset_imm = [0, 1] in { + foreach mask_imm = [0, 1] in { + foreach threadmask_imm = THREADMASK_INFO<sync>.ret in { + def : SHFL_INSTR<sync, mode, regclass, return_pred, + offset_imm, mask_imm, threadmask_imm>, + Requires<!if(sync, [hasSM30], [hasSM30, hasSHFL])>; + } + } + } + } + } + } } -// On sm_70 these don't have to be convergent, so we may eventually want to -// implement non-convergent variant of this intrinsic. 
-defm INT_SHFL_SYNC_DOWN_I32 : SHFL_SYNC<Int32Regs, "down", int_nvvm_shfl_sync_down_i32>; -defm INT_SHFL_SYNC_DOWN_F32 : SHFL_SYNC<Float32Regs, "down", int_nvvm_shfl_sync_down_f32>; -defm INT_SHFL_SYNC_UP_I32 : SHFL_SYNC<Int32Regs, "up", int_nvvm_shfl_sync_up_i32>; -defm INT_SHFL_SYNC_UP_F32 : SHFL_SYNC<Float32Regs, "up", int_nvvm_shfl_sync_up_f32>; -defm INT_SHFL_SYNC_BFLY_I32 : SHFL_SYNC<Int32Regs, "bfly", int_nvvm_shfl_sync_bfly_i32>; -defm INT_SHFL_SYNC_BFLY_F32 : SHFL_SYNC<Float32Regs, "bfly", int_nvvm_shfl_sync_bfly_f32>; -defm INT_SHFL_SYNC_IDX_I32 : SHFL_SYNC<Int32Regs, "idx", int_nvvm_shfl_sync_idx_i32>; -defm INT_SHFL_SYNC_IDX_F32 : SHFL_SYNC<Float32Regs, "idx", int_nvvm_shfl_sync_idx_f32>; - - // vote.{all,any,uni,ballot} multiclass VOTE<NVPTXRegClass regclass, string mode, Intrinsic IntOp> { def : NVPTXInst<(outs regclass:$dest), (ins Int1Regs:$pred), diff --git a/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp b/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp index 0743a2986718..83039241a7c7 100644 --- a/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp +++ b/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp @@ -103,7 +103,7 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) { // Do the transformation of an aggr load/copy/set to a loop // for (LoadInst *LI : AggrLoads) { - StoreInst *SI = dyn_cast<StoreInst>(*LI->user_begin()); + auto *SI = cast<StoreInst>(*LI->user_begin()); Value *SrcAddr = LI->getOperand(0); Value *DstAddr = SI->getOperand(1); unsigned NumLoads = DL.getTypeStoreSize(LI->getType()); diff --git a/lib/Target/NVPTX/NVPTXLowerAlloca.cpp b/lib/Target/NVPTX/NVPTXLowerAlloca.cpp index 76fb9f3fa692..945b7286b03c 100644 --- a/lib/Target/NVPTX/NVPTXLowerAlloca.cpp +++ b/lib/Target/NVPTX/NVPTXLowerAlloca.cpp @@ -41,12 +41,12 @@ void initializeNVPTXLowerAllocaPass(PassRegistry &); } namespace { -class NVPTXLowerAlloca : public BasicBlockPass { - bool runOnBasicBlock(BasicBlock &BB) override; +class NVPTXLowerAlloca : public FunctionPass { + bool runOnFunction(Function &F) override; public: static char ID; // Pass identification, replacement for typeid - NVPTXLowerAlloca() : BasicBlockPass(ID) {} + NVPTXLowerAlloca() : FunctionPass(ID) {} StringRef getPassName() const override { return "convert address space of alloca'ed memory to local"; } @@ -61,58 +61,61 @@ INITIALIZE_PASS(NVPTXLowerAlloca, "nvptx-lower-alloca", // ============================================================================= // Main function for this pass. 
// ============================================================================= -bool NVPTXLowerAlloca::runOnBasicBlock(BasicBlock &BB) { - if (skipBasicBlock(BB)) +bool NVPTXLowerAlloca::runOnFunction(Function &F) { + if (skipFunction(F)) return false; bool Changed = false; - for (auto &I : BB) { - if (auto allocaInst = dyn_cast<AllocaInst>(&I)) { - Changed = true; - auto PTy = dyn_cast<PointerType>(allocaInst->getType()); - auto ETy = PTy->getElementType(); - auto LocalAddrTy = PointerType::get(ETy, ADDRESS_SPACE_LOCAL); - auto NewASCToLocal = new AddrSpaceCastInst(allocaInst, LocalAddrTy, ""); - auto GenericAddrTy = PointerType::get(ETy, ADDRESS_SPACE_GENERIC); - auto NewASCToGeneric = new AddrSpaceCastInst(NewASCToLocal, - GenericAddrTy, ""); - NewASCToLocal->insertAfter(allocaInst); - NewASCToGeneric->insertAfter(NewASCToLocal); - for (Value::use_iterator UI = allocaInst->use_begin(), - UE = allocaInst->use_end(); - UI != UE; ) { - // Check Load, Store, GEP, and BitCast Uses on alloca and make them - // use the converted generic address, in order to expose non-generic - // addrspacecast to NVPTXInferAddressSpaces. For other types - // of instructions this is unnecessary and may introduce redundant - // address cast. - const auto &AllocaUse = *UI++; - auto LI = dyn_cast<LoadInst>(AllocaUse.getUser()); - if (LI && LI->getPointerOperand() == allocaInst && !LI->isVolatile()) { - LI->setOperand(LI->getPointerOperandIndex(), NewASCToGeneric); - continue; - } - auto SI = dyn_cast<StoreInst>(AllocaUse.getUser()); - if (SI && SI->getPointerOperand() == allocaInst && !SI->isVolatile()) { - SI->setOperand(SI->getPointerOperandIndex(), NewASCToGeneric); - continue; - } - auto GI = dyn_cast<GetElementPtrInst>(AllocaUse.getUser()); - if (GI && GI->getPointerOperand() == allocaInst) { - GI->setOperand(GI->getPointerOperandIndex(), NewASCToGeneric); - continue; - } - auto BI = dyn_cast<BitCastInst>(AllocaUse.getUser()); - if (BI && BI->getOperand(0) == allocaInst) { - BI->setOperand(0, NewASCToGeneric); - continue; + for (auto &BB : F) + for (auto &I : BB) { + if (auto allocaInst = dyn_cast<AllocaInst>(&I)) { + Changed = true; + auto PTy = dyn_cast<PointerType>(allocaInst->getType()); + auto ETy = PTy->getElementType(); + auto LocalAddrTy = PointerType::get(ETy, ADDRESS_SPACE_LOCAL); + auto NewASCToLocal = new AddrSpaceCastInst(allocaInst, LocalAddrTy, ""); + auto GenericAddrTy = PointerType::get(ETy, ADDRESS_SPACE_GENERIC); + auto NewASCToGeneric = + new AddrSpaceCastInst(NewASCToLocal, GenericAddrTy, ""); + NewASCToLocal->insertAfter(allocaInst); + NewASCToGeneric->insertAfter(NewASCToLocal); + for (Value::use_iterator UI = allocaInst->use_begin(), + UE = allocaInst->use_end(); + UI != UE;) { + // Check Load, Store, GEP, and BitCast Uses on alloca and make them + // use the converted generic address, in order to expose non-generic + // addrspacecast to NVPTXInferAddressSpaces. For other types + // of instructions this is unnecessary and may introduce redundant + // address cast. 
+ const auto &AllocaUse = *UI++; + auto LI = dyn_cast<LoadInst>(AllocaUse.getUser()); + if (LI && LI->getPointerOperand() == allocaInst && + !LI->isVolatile()) { + LI->setOperand(LI->getPointerOperandIndex(), NewASCToGeneric); + continue; + } + auto SI = dyn_cast<StoreInst>(AllocaUse.getUser()); + if (SI && SI->getPointerOperand() == allocaInst && + !SI->isVolatile()) { + SI->setOperand(SI->getPointerOperandIndex(), NewASCToGeneric); + continue; + } + auto GI = dyn_cast<GetElementPtrInst>(AllocaUse.getUser()); + if (GI && GI->getPointerOperand() == allocaInst) { + GI->setOperand(GI->getPointerOperandIndex(), NewASCToGeneric); + continue; + } + auto BI = dyn_cast<BitCastInst>(AllocaUse.getUser()); + if (BI && BI->getOperand(0) == allocaInst) { + BI->setOperand(0, NewASCToGeneric); + continue; + } } } } - } return Changed; } -BasicBlockPass *llvm::createNVPTXLowerAllocaPass() { +FunctionPass *llvm::createNVPTXLowerAllocaPass() { return new NVPTXLowerAlloca(); } diff --git a/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/lib/Target/NVPTX/NVPTXLowerArgs.cpp index c5e02e34e25e..c3c5f6fbcba7 100644 --- a/lib/Target/NVPTX/NVPTXLowerArgs.cpp +++ b/lib/Target/NVPTX/NVPTXLowerArgs.cpp @@ -164,7 +164,7 @@ void NVPTXLowerArgs::handleByValParam(Argument *Arg) { // Set the alignment to alignment of the byval parameter. This is because, // later load/stores assume that alignment, and we are going to replace // the use of the byval parameter with this alloca instruction. - AllocA->setAlignment(Func->getParamAlignment(Arg->getArgNo())); + AllocA->setAlignment(MaybeAlign(Func->getParamAlignment(Arg->getArgNo()))); Arg->replaceAllUsesWith(AllocA); Value *ArgInParam = new AddrSpaceCastInst( diff --git a/lib/Target/NVPTX/NVPTXPeephole.cpp b/lib/Target/NVPTX/NVPTXPeephole.cpp index 629757db8707..5e6411c61eab 100644 --- a/lib/Target/NVPTX/NVPTXPeephole.cpp +++ b/lib/Target/NVPTX/NVPTXPeephole.cpp @@ -81,7 +81,7 @@ static bool isCVTAToLocalCombinationCandidate(MachineInstr &Root) { auto &Op = Root.getOperand(1); const auto &MRI = MF.getRegInfo(); MachineInstr *GenericAddrDef = nullptr; - if (Op.isReg() && TargetRegisterInfo::isVirtualRegister(Op.getReg())) { + if (Op.isReg() && Register::isVirtualRegister(Op.getReg())) { GenericAddrDef = MRI.getUniqueVRegDef(Op.getReg()); } diff --git a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp index 4c5a9adf1f65..a7127b0e9a99 100644 --- a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp +++ b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp @@ -178,7 +178,7 @@ NVPTXPrologEpilogPass::calculateFrameObjectOffsets(MachineFunction &Fn) { // frame index registers. Functions which don't want/need this optimization // will continue to use the existing code path. if (MFI.getUseLocalStackAllocationBlock()) { - unsigned Align = MFI.getLocalFrameMaxAlign(); + unsigned Align = MFI.getLocalFrameMaxAlign().value(); // Adjust to alignment boundary. 
     Offset = (Offset + Align - 1) / Align * Align;
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 11b3fe2fa3d3..f58fb5717773 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -116,7 +116,7 @@ NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT,
                         CPU, FS, Options, Reloc::PIC_,
                         getEffectiveCodeModel(CM, CodeModel::Small), OL),
       is64bit(is64bit), UseShortPointers(UseShortPointersOpt),
-      TLOF(llvm::make_unique<NVPTXTargetObjectFile>()),
+      TLOF(std::make_unique<NVPTXTargetObjectFile>()),
       Subtarget(TT, CPU, FS, *this) {
   if (TT.getOS() == Triple::NVCL)
     drvInterface = NVPTX::NVCL;
diff --git a/lib/Target/NVPTX/NVPTXUtilities.cpp b/lib/Target/NVPTX/NVPTXUtilities.cpp
index 665eb1383253..43c2e9920403 100644
--- a/lib/Target/NVPTX/NVPTXUtilities.cpp
+++ b/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -19,10 +19,11 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/Support/ManagedStatic.h"
-#include "llvm/Support/MutexGuard.h"
+#include "llvm/Support/Mutex.h"
 #include <algorithm>
 #include <cstring>
 #include <map>
+#include <mutex>
 #include <string>
 #include <vector>
@@ -38,12 +39,12 @@ static ManagedStatic<per_module_annot_t> annotationCache;
 static sys::Mutex Lock;

 void clearAnnotationCache(const Module *Mod) {
-  MutexGuard Guard(Lock);
+  std::lock_guard<sys::Mutex> Guard(Lock);
   annotationCache->erase(Mod);
 }

 static void cacheAnnotationFromMD(const MDNode *md, key_val_pair_t &retval) {
-  MutexGuard Guard(Lock);
+  std::lock_guard<sys::Mutex> Guard(Lock);
   assert(md && "Invalid mdnode for annotation");
   assert((md->getNumOperands() % 2) == 1 && "Invalid number of operands");
   // start index = 1, to skip the global variable key
@@ -69,7 +70,7 @@ static void cacheAnnotationFromMD(const MDNode *md, key_val_pair_t &retval) {
 }

 static void cacheAnnotationFromMD(const Module *m, const GlobalValue *gv) {
-  MutexGuard Guard(Lock);
+  std::lock_guard<sys::Mutex> Guard(Lock);
   NamedMDNode *NMD = m->getNamedMetadata("nvvm.annotations");
   if (!NMD)
     return;
@@ -103,7 +104,7 @@ static void cacheAnnotationFromMD(const Module *m, const GlobalValue *gv) {

 bool findOneNVVMAnnotation(const GlobalValue *gv, const std::string &prop,
                            unsigned &retval) {
-  MutexGuard Guard(Lock);
+  std::lock_guard<sys::Mutex> Guard(Lock);
   const Module *m = gv->getParent();
   if ((*annotationCache).find(m) == (*annotationCache).end())
     cacheAnnotationFromMD(m, gv);
@@ -117,7 +118,7 @@ bool findOneNVVMAnnotation(const GlobalValue *gv, const std::string &prop,

 bool findAllNVVMAnnotation(const GlobalValue *gv, const std::string &prop,
                            std::vector<unsigned> &retval) {
-  MutexGuard Guard(Lock);
+  std::lock_guard<sys::Mutex> Guard(Lock);
   const Module *m = gv->getParent();
   if ((*annotationCache).find(m) == (*annotationCache).end())
     cacheAnnotationFromMD(m, gv);
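For context on the shuffle changes above: with the new `hasSHFL` predicate, the non-sync `llvm.nvvm.shfl.*` intrinsics have no selectable pattern on sm_70+ when targeting PTX 6.4+, so IR producers have to emit the `.sync` forms instead. A minimal sketch of emitting one such call with IRBuilder, assuming the LLVM-9-era `Intrinsic::getDeclaration` API and the operand order shown in the patterns above (membermask, value, offset, mask-and-clamp); the helper name `emitShflDownSync` is hypothetical:

```cpp
// Illustrative sketch only -- not code from this patch.
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"

// Emits: %r = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 -1, i32 %val, i32 %off, i32 31)
llvm::Value *emitShflDownSync(llvm::IRBuilder<> &B, llvm::Module &M,
                              llvm::Value *Val, llvm::Value *Offset) {
  using namespace llvm;
  Function *Shfl =
      Intrinsic::getDeclaration(&M, Intrinsic::nvvm_shfl_sync_down_i32);
  Value *MemberMask = B.getInt32(0xffffffff); // all lanes of the warp participate
  Value *MaskAndClamp = B.getInt32(0x1f);     // clamp encoding for a full-width (32-lane) shuffle
  return B.CreateCall(Shfl, {MemberMask, Val, Offset, MaskAndClamp});
}
```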