path: root/lib/Target/NVPTX
Diffstat (limited to 'lib/Target/NVPTX')
-rw-r--r--  lib/Target/NVPTX/NVPTX.h                      2
-rw-r--r--  lib/Target/NVPTX/NVPTXAsmPrinter.cpp         34
-rw-r--r--  lib/Target/NVPTX/NVPTXAsmPrinter.h            2
-rw-r--r--  lib/Target/NVPTX/NVPTXFrameLowering.cpp       2
-rw-r--r--  lib/Target/NVPTX/NVPTXISelLowering.cpp       58
-rw-r--r--  lib/Target/NVPTX/NVPTXInstrInfo.td           13
-rw-r--r--  lib/Target/NVPTX/NVPTXIntrinsics.td         169
-rw-r--r--  lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp     2
-rw-r--r--  lib/Target/NVPTX/NVPTXLowerAlloca.cpp        97
-rw-r--r--  lib/Target/NVPTX/NVPTXLowerArgs.cpp           2
-rw-r--r--  lib/Target/NVPTX/NVPTXPeephole.cpp            2
-rw-r--r--  lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp    2
-rw-r--r--  lib/Target/NVPTX/NVPTXTargetMachine.cpp       2
-rw-r--r--  lib/Target/NVPTX/NVPTXUtilities.cpp          13
14 files changed, 178 insertions, 222 deletions
diff --git a/lib/Target/NVPTX/NVPTX.h b/lib/Target/NVPTX/NVPTX.h
index 6530c40ea100..0acbace5f848 100644
--- a/lib/Target/NVPTX/NVPTX.h
+++ b/lib/Target/NVPTX/NVPTX.h
@@ -44,7 +44,7 @@ MachineFunctionPass *createNVPTXPrologEpilogPass();
MachineFunctionPass *createNVPTXReplaceImageHandlesPass();
FunctionPass *createNVPTXImageOptimizerPass();
FunctionPass *createNVPTXLowerArgsPass(const NVPTXTargetMachine *TM);
-BasicBlockPass *createNVPTXLowerAllocaPass();
+FunctionPass *createNVPTXLowerAllocaPass();
MachineFunctionPass *createNVPTXPeephole();
MachineFunctionPass *createNVPTXProxyRegErasurePass();
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 5f38b4a3c4c5..307f4d58c3ab 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -282,7 +282,7 @@ bool NVPTXAsmPrinter::lowerOperand(const MachineOperand &MO,
}
unsigned NVPTXAsmPrinter::encodeVirtualRegister(unsigned Reg) {
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (Register::isVirtualRegister(Reg)) {
const TargetRegisterClass *RC = MRI->getRegClass(Reg);
DenseMap<unsigned, unsigned> &RegMap = VRegMapping[RC];
@@ -434,7 +434,7 @@ bool NVPTXAsmPrinter::isLoopHeaderOfNoUnroll(
return false;
}
-void NVPTXAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const {
+void NVPTXAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) {
AsmPrinter::EmitBasicBlockStart(MBB);
if (isLoopHeaderOfNoUnroll(MBB))
OutStreamer->EmitRawText(StringRef("\t.pragma \"nounroll\";\n"));
@@ -507,8 +507,8 @@ const MCSymbol *NVPTXAsmPrinter::getFunctionFrameSymbol() const {
}
void NVPTXAsmPrinter::emitImplicitDef(const MachineInstr *MI) const {
- unsigned RegNo = MI->getOperand(0).getReg();
- if (TargetRegisterInfo::isVirtualRegister(RegNo)) {
+ Register RegNo = MI->getOperand(0).getReg();
+ if (Register::isVirtualRegister(RegNo)) {
OutStreamer->AddComment(Twine("implicit-def: ") +
getVirtualRegisterName(RegNo));
} else {
@@ -1397,7 +1397,7 @@ static unsigned int getOpenCLAlignment(const DataLayout &DL, Type *Ty) {
auto *FTy = dyn_cast<FunctionType>(Ty);
if (FTy)
- return DL.getPointerPrefAlignment();
+ return DL.getPointerPrefAlignment().value();
return DL.getPrefTypeAlignment(Ty);
}
@@ -1473,12 +1473,11 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
// Just print .param .align <a> .b8 .param[size];
// <a> = PAL.getparamalignment
// size = typeallocsize of element type
- unsigned align = PAL.getParamAlignment(paramIndex);
- if (align == 0)
- align = DL.getABITypeAlignment(Ty);
+ const Align align = DL.getValueOrABITypeAlignment(
+ PAL.getParamAlignment(paramIndex), Ty);
unsigned sz = DL.getTypeAllocSize(Ty);
- O << "\t.param .align " << align << " .b8 ";
+ O << "\t.param .align " << align.value() << " .b8 ";
printParamName(I, paramIndex, O);
O << "[" << sz << "]";
@@ -1559,9 +1558,8 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
// Just print .param .align <a> .b8 .param[size];
// <a> = PAL.getparamalignment
// size = typeallocsize of element type
- unsigned align = PAL.getParamAlignment(paramIndex);
- if (align == 0)
- align = DL.getABITypeAlignment(ETy);
+ Align align =
+ DL.getValueOrABITypeAlignment(PAL.getParamAlignment(paramIndex), ETy);
// Work around a bug in ptxas. When PTX code takes address of
// byval parameter with alignment < 4, ptxas generates code to
// spill argument into memory. Alas on sm_50+ ptxas generates
@@ -1573,10 +1571,10 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
// TODO: this will need to be undone when we get to support multi-TU
// device-side compilation as it breaks ABI compatibility with nvcc.
// Hopefully ptxas bug is fixed by then.
- if (!isKernelFunc && align < 4)
- align = 4;
+ if (!isKernelFunc && align < Align(4))
+ align = Align(4);
unsigned sz = DL.getTypeAllocSize(ETy);
- O << "\t.param .align " << align << " .b8 ";
+ O << "\t.param .align " << align.value() << " .b8 ";
printParamName(I, paramIndex, O);
O << "[" << sz << "]";
continue;
@@ -1653,7 +1651,7 @@ void NVPTXAsmPrinter::setAndEmitFunctionVirtualRegisters(
// We use the per class virtual register number in the ptx output.
unsigned int numVRs = MRI->getNumVirtRegs();
for (unsigned i = 0; i < numVRs; i++) {
- unsigned int vr = TRI->index2VirtReg(i);
+ unsigned int vr = Register::index2VirtReg(i);
const TargetRegisterClass *RC = MRI->getRegClass(vr);
DenseMap<unsigned, unsigned> &regmap = VRegMapping[RC];
int n = regmap.size();
@@ -1861,7 +1859,7 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes,
case Type::HalfTyID:
case Type::FloatTyID:
case Type::DoubleTyID: {
- const ConstantFP *CFP = dyn_cast<ConstantFP>(CPV);
+ const auto *CFP = cast<ConstantFP>(CPV);
Type *Ty = CFP->getType();
if (Ty == Type::getHalfTy(CPV->getContext())) {
APInt API = CFP->getValueAPF().bitcastToAPInt();
@@ -2212,7 +2210,7 @@ void NVPTXAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
const MachineOperand &MO = MI->getOperand(opNum);
switch (MO.getType()) {
case MachineOperand::MO_Register:
- if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
+ if (Register::isPhysicalRegister(MO.getReg())) {
if (MO.getReg() == NVPTX::VRDepot)
O << DEPOTNAME << getFunctionNumber();
else
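
The NVPTXAsmPrinter hunks above swap the static TargetRegisterInfo::is{Virtual,Physical}Register helpers and TRI->index2VirtReg for the equivalent static members of llvm::Register. A minimal sketch of the new API, assuming an LLVM tree that already ships llvm/CodeGen/Register.h; the helper name is made up for illustration:

    #include "llvm/CodeGen/MachineOperand.h"
    #include "llvm/CodeGen/Register.h"

    using namespace llvm;

    // Hypothetical helper: classify an operand's register via the Register API.
    static const char *classifyReg(const MachineOperand &MO) {
      Register Reg = MO.getReg();
      if (Register::isVirtualRegister(Reg))   // was TargetRegisterInfo::isVirtualRegister
        return "virtual";
      if (Register::isPhysicalRegister(Reg))  // was TargetRegisterInfo::isPhysicalRegister
        return "physical";
      return "none";
    }

    // Likewise, Register::index2VirtReg(i) replaces TRI->index2VirtReg(i) when
    // walking MRI->getNumVirtRegs() virtual registers.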
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.h b/lib/Target/NVPTX/NVPTXAsmPrinter.h
index 43ae57ac1262..7a66854d32f4 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.h
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.h
@@ -200,7 +200,7 @@ private:
const Function *F;
std::string CurrentFnName;
- void EmitBasicBlockStart(const MachineBasicBlock &MBB) const override;
+ void EmitBasicBlockStart(const MachineBasicBlock &MBB) override;
void EmitFunctionEntryLabel() override;
void EmitFunctionBodyStart() override;
void EmitFunctionBodyEnd() override;
diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.cpp b/lib/Target/NVPTX/NVPTXFrameLowering.cpp
index 46f08b23d31a..d26912f47e50 100644
--- a/lib/Target/NVPTX/NVPTXFrameLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXFrameLowering.cpp
@@ -25,7 +25,7 @@
using namespace llvm;
NVPTXFrameLowering::NVPTXFrameLowering()
- : TargetFrameLowering(TargetFrameLowering::StackGrowsUp, 8, 0) {}
+ : TargetFrameLowering(TargetFrameLowering::StackGrowsUp, Align(8), 0) {}
bool NVPTXFrameLowering::hasFP(const MachineFunction &MF) const { return true; }
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp
index ae1aa98da0e8..9acd0bea66fd 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -480,7 +480,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
setOperationAction(ISD::TRAP, MVT::Other, Legal);
// Register custom handling for vector loads/stores
- for (MVT VT : MVT::vector_valuetypes()) {
+ for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
if (IsPTXVectorType(VT)) {
setOperationAction(ISD::LOAD, VT, Custom);
setOperationAction(ISD::STORE, VT, Custom);
@@ -1291,8 +1291,8 @@ std::string NVPTXTargetLowering::getPrototype(
O << ".param .b" << size << " _";
} else if (isa<PointerType>(retTy)) {
O << ".param .b" << PtrVT.getSizeInBits() << " _";
- } else if (retTy->isAggregateType() || retTy->isVectorTy() || retTy->isIntegerTy(128)) {
- auto &DL = CS.getCalledFunction()->getParent()->getDataLayout();
+ } else if (retTy->isAggregateType() || retTy->isVectorTy() ||
+ retTy->isIntegerTy(128)) {
O << ".param .align " << retAlignment << " .b8 _["
<< DL.getTypeAllocSize(retTy) << "]";
} else {
@@ -2230,8 +2230,8 @@ SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
if (Op.getValueType() == MVT::v2f16) {
LoadSDNode *Load = cast<LoadSDNode>(Op);
EVT MemVT = Load->getMemoryVT();
- if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
- *Load->getMemOperand())) {
+ if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
+ MemVT, *Load->getMemOperand())) {
SDValue Ops[2];
std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
return DAG.getMergeValues(Ops, SDLoc(Op));
@@ -2273,8 +2273,8 @@ SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
// v2f16 is legal, so we can't rely on legalizer to handle unaligned
// stores and have to handle it here.
if (VT == MVT::v2f16 &&
- !allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
- *Store->getMemOperand()))
+ !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
+ VT, *Store->getMemOperand()))
return expandUnalignedStore(Store, DAG);
if (VT.isVector())
@@ -3497,7 +3497,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.flags = MachineMemOperand::MOLoad;
- Info.align = 16;
+ Info.align = Align(16);
return true;
}
case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col:
@@ -3521,7 +3521,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.flags = MachineMemOperand::MOLoad;
- Info.align = 8;
+ Info.align = Align(8);
return true;
}
@@ -3547,7 +3547,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.flags = MachineMemOperand::MOLoad;
- Info.align = 16;
+ Info.align = Align(16);
return true;
}
@@ -3585,7 +3585,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.flags = MachineMemOperand::MOLoad;
- Info.align = 4;
+ Info.align = Align(4);
return true;
}
@@ -3606,7 +3606,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.flags = MachineMemOperand::MOLoad;
- Info.align = 16;
+ Info.align = Align(16);
return true;
}
@@ -3627,7 +3627,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.flags = MachineMemOperand::MOLoad;
- Info.align = 16;
+ Info.align = Align(16);
return true;
}
@@ -3648,7 +3648,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.flags = MachineMemOperand::MOLoad;
- Info.align = 16;
+ Info.align = Align(16);
return true;
}
@@ -3665,7 +3665,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.flags = MachineMemOperand::MOLoad;
- Info.align = 8;
+ Info.align = Align(8);
return true;
}
@@ -3686,7 +3686,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.flags = MachineMemOperand::MOStore;
- Info.align = 16;
+ Info.align = Align(16);
return true;
}
@@ -3707,7 +3707,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.flags = MachineMemOperand::MOStore;
- Info.align = 16;
+ Info.align = Align(16);
return true;
}
@@ -3728,7 +3728,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.flags = MachineMemOperand::MOStore;
- Info.align = 16;
+ Info.align = Align(16);
return true;
}
@@ -3745,7 +3745,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.flags = MachineMemOperand::MOStore;
- Info.align = 8;
+ Info.align = Align(8);
return true;
}
@@ -3780,7 +3780,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
- Info.align = 0;
+ Info.align.reset();
return true;
}
@@ -3798,7 +3798,8 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.flags = MachineMemOperand::MOLoad;
- Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
+ Info.align =
+ MaybeAlign(cast<ConstantInt>(I.getArgOperand(1))->getZExtValue());
return true;
}
@@ -3817,7 +3818,8 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.flags = MachineMemOperand::MOLoad;
- Info.align = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
+ Info.align =
+ MaybeAlign(cast<ConstantInt>(I.getArgOperand(1))->getZExtValue());
return true;
}
@@ -3883,7 +3885,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
Info.ptrVal = nullptr;
Info.offset = 0;
Info.flags = MachineMemOperand::MOLoad;
- Info.align = 16;
+ Info.align = Align(16);
return true;
case Intrinsic::nvvm_tex_1d_v4s32_s32:
@@ -4003,7 +4005,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
Info.ptrVal = nullptr;
Info.offset = 0;
Info.flags = MachineMemOperand::MOLoad;
- Info.align = 16;
+ Info.align = Align(16);
return true;
case Intrinsic::nvvm_suld_1d_i8_clamp:
@@ -4056,7 +4058,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
Info.ptrVal = nullptr;
Info.offset = 0;
Info.flags = MachineMemOperand::MOLoad;
- Info.align = 16;
+ Info.align = Align(16);
return true;
case Intrinsic::nvvm_suld_1d_i16_clamp:
@@ -4109,7 +4111,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
Info.ptrVal = nullptr;
Info.offset = 0;
Info.flags = MachineMemOperand::MOLoad;
- Info.align = 16;
+ Info.align = Align(16);
return true;
case Intrinsic::nvvm_suld_1d_i32_clamp:
@@ -4162,7 +4164,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
Info.ptrVal = nullptr;
Info.offset = 0;
Info.flags = MachineMemOperand::MOLoad;
- Info.align = 16;
+ Info.align = Align(16);
return true;
case Intrinsic::nvvm_suld_1d_i64_clamp:
@@ -4200,7 +4202,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
Info.ptrVal = nullptr;
Info.offset = 0;
Info.flags = MachineMemOperand::MOLoad;
- Info.align = 16;
+ Info.align = Align(16);
return true;
}
return false;
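
The getTgtMemIntrinsic changes above move Info.align from a raw unsigned to the Align/MaybeAlign types from llvm/Support/Alignment.h: Align(16) replaces the bare 16, MaybeAlign models "no alignment specified", and .reset() replaces the old align = 0. A small sketch of how those types behave, assuming that header is available:

    #include "llvm/Support/Alignment.h"
    #include <cassert>

    using namespace llvm;

    int main() {
      Align A16(16);                 // must be a power of two
      assert(A16.value() == 16);     // .value() recovers the raw byte count

      MaybeAlign MA(0);              // 0 means "no alignment specified"...
      assert(!MA);                   // ...so the optional is empty

      MA = Align(8);                 // set an explicit alignment
      MA.reset();                    // back to unspecified, as in Info.align.reset()
      return 0;
    }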
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td
index 62da3c79f465..fe7a84f9a361 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -143,12 +143,17 @@ def hasPTX31 : Predicate<"Subtarget->getPTXVersion() >= 31">;
def hasPTX60 : Predicate<"Subtarget->getPTXVersion() >= 60">;
def hasPTX61 : Predicate<"Subtarget->getPTXVersion() >= 61">;
def hasPTX63 : Predicate<"Subtarget->getPTXVersion() >= 63">;
+def hasPTX64 : Predicate<"Subtarget->getPTXVersion() >= 64">;
def hasSM30 : Predicate<"Subtarget->getSmVersion() >= 30">;
def hasSM70 : Predicate<"Subtarget->getSmVersion() >= 70">;
def hasSM72 : Predicate<"Subtarget->getSmVersion() >= 72">;
def hasSM75 : Predicate<"Subtarget->getSmVersion() >= 75">;
+// non-sync shfl instructions are not available on sm_70+ in PTX6.4+
+def hasSHFL : Predicate<"!(Subtarget->getSmVersion() >= 70"
+ "&& Subtarget->getPTXVersion() >= 64)">;
+
def useShortPtr : Predicate<"useShortPointers()">;
def useFP16Math: Predicate<"Subtarget->allowFP16Math()">;
@@ -2908,7 +2913,7 @@ def : Pat<(ctlz Int32Regs:$a), (CLZr32 Int32Regs:$a)>;
// ctz instruction always returns a 32-bit value. For ctlz.i64, convert the
// ptx value to 64 bits to match the ISD node's semantics, unless we know we're
// truncating back down to 32 bits.
-def : Pat<(ctlz Int64Regs:$a), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
+def : Pat<(i64 (ctlz Int64Regs:$a)), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
def : Pat<(i32 (trunc (ctlz Int64Regs:$a))), (CLZr64 Int64Regs:$a)>;
// For 16-bit ctlz, we zero-extend to 32-bit, perform the count, then trunc the
@@ -2925,10 +2930,10 @@ def : Pat<(i32 (trunc (ctlz Int64Regs:$a))), (CLZr64 Int64Regs:$a)>;
// and then ctlz that value. This way we don't have to subtract 16 from the
// result. Unfortunately today we don't have a way to generate
// "mov b32reg, {b16imm, b16reg}", so we don't do this optimization.
-def : Pat<(ctlz Int16Regs:$a),
+def : Pat<(i16 (ctlz Int16Regs:$a)),
(SUBi16ri (CVT_u16_u32
(CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE), 16)>;
-def : Pat<(i32 (zext (ctlz Int16Regs:$a))),
+def : Pat<(i32 (zext (i16 (ctlz Int16Regs:$a)))),
(SUBi32ri (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), 16)>;
// Population count
@@ -2953,7 +2958,7 @@ def : Pat<(i32 (trunc (ctpop Int64Regs:$a))), (POPCr64 Int64Regs:$a)>;
// If we know that we're storing into an i32, we can avoid the final trunc.
def : Pat<(ctpop Int16Regs:$a),
(CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE)>;
-def : Pat<(i32 (zext (ctpop Int16Regs:$a))),
+def : Pat<(i32 (zext (i16 (ctpop Int16Regs:$a)))),
(POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE))>;
// fpround f32 -> f16
diff --git a/lib/Target/NVPTX/NVPTXIntrinsics.td b/lib/Target/NVPTX/NVPTXIntrinsics.td
index 1752d3e0575e..c52195fb0449 100644
--- a/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -56,6 +56,10 @@ class RegSeq<int n, string prefix> {
[]);
}
+class THREADMASK_INFO<bit sync> {
+ list<bit> ret = !if(sync, [0,1], [0]);
+}
+
//-----------------------------------
// Synchronization and shuffle functions
//-----------------------------------
@@ -129,121 +133,64 @@ def INT_BARRIER_SYNC_CNT_II : NVPTXInst<(outs), (ins i32imm:$id, i32imm:$cnt),
[(int_nvvm_barrier_sync_cnt imm:$id, imm:$cnt)]>,
Requires<[hasPTX60, hasSM30]>;
-
-// shfl.{up,down,bfly,idx}.b32
-multiclass SHFL<NVPTXRegClass regclass, string mode, Intrinsic IntOp> {
- // The last two parameters to shfl can be regs or imms. ptxas is smart
- // enough to inline constant registers, so strictly speaking we don't need to
- // handle immediates here. But it's easy enough, and it makes our ptx more
- // readable.
- def reg : NVPTXInst<
- (outs regclass:$dst),
- (ins regclass:$src, Int32Regs:$offset, Int32Regs:$mask),
- !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"),
- [(set regclass:$dst, (IntOp regclass:$src, Int32Regs:$offset, Int32Regs:$mask))]>;
-
- def imm1 : NVPTXInst<
- (outs regclass:$dst),
- (ins regclass:$src, i32imm:$offset, Int32Regs:$mask),
- !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"),
- [(set regclass:$dst, (IntOp regclass:$src, imm:$offset, Int32Regs:$mask))]>;
-
- def imm2 : NVPTXInst<
- (outs regclass:$dst),
- (ins regclass:$src, Int32Regs:$offset, i32imm:$mask),
- !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"),
- [(set regclass:$dst, (IntOp regclass:$src, Int32Regs:$offset, imm:$mask))]>;
-
- def imm3 : NVPTXInst<
- (outs regclass:$dst),
- (ins regclass:$src, i32imm:$offset, i32imm:$mask),
- !strconcat("shfl.", mode, ".b32 $dst, $src, $offset, $mask;"),
- [(set regclass:$dst, (IntOp regclass:$src, imm:$offset, imm:$mask))]>;
+class SHFL_INSTR<bit sync, string mode, string reg, bit return_pred,
+ bit offset_imm, bit mask_imm, bit threadmask_imm>
+ : NVPTXInst<(outs), (ins), "?", []> {
+ NVPTXRegClass rc = !cond(
+ !eq(reg, "i32"): Int32Regs,
+ !eq(reg, "f32"): Float32Regs);
+ string IntrName = "int_nvvm_shfl_"
+ # !if(sync, "sync_", "")
+ # mode
+ # "_" # reg
+ # !if(return_pred, "p", "");
+ Intrinsic Intr = !cast<Intrinsic>(IntrName);
+ let InOperandList = !con(
+ !if(sync,
+ !dag(ins, !if(threadmask_imm, [i32imm], [Int32Regs]), ["threadmask"]),
+ (ins)),
+ (ins rc:$src),
+ !dag(ins, !if(offset_imm, [i32imm], [Int32Regs]), ["offset"]),
+ !dag(ins, !if(mask_imm, [i32imm], [Int32Regs]), ["mask"])
+ );
+ let OutOperandList = !if(return_pred, (outs rc:$dst, Int1Regs:$pred), (outs rc:$dst));
+ let AsmString = "shfl."
+ # !if(sync, "sync.", "")
+ # mode # ".b32\t"
+ # "$dst"
+ # !if(return_pred, "|$pred", "") # ", "
+ # "$src, $offset, $mask"
+ # !if(sync, ", $threadmask", "")
+ # ";"
+ ;
+ let Pattern = [!con(
+ !foreach(tmp, OutOperandList,
+ !subst(outs, set,
+ !subst(i32imm, imm, tmp))),
+ (set !foreach(tmp, InOperandList,
+ !subst(ins, Intr,
+ !subst(i32imm, imm, tmp))))
+ )];
}
-defm INT_SHFL_DOWN_I32 : SHFL<Int32Regs, "down", int_nvvm_shfl_down_i32>;
-defm INT_SHFL_DOWN_F32 : SHFL<Float32Regs, "down", int_nvvm_shfl_down_f32>;
-defm INT_SHFL_UP_I32 : SHFL<Int32Regs, "up", int_nvvm_shfl_up_i32>;
-defm INT_SHFL_UP_F32 : SHFL<Float32Regs, "up", int_nvvm_shfl_up_f32>;
-defm INT_SHFL_BFLY_I32 : SHFL<Int32Regs, "bfly", int_nvvm_shfl_bfly_i32>;
-defm INT_SHFL_BFLY_F32 : SHFL<Float32Regs, "bfly", int_nvvm_shfl_bfly_f32>;
-defm INT_SHFL_IDX_I32 : SHFL<Int32Regs, "idx", int_nvvm_shfl_idx_i32>;
-defm INT_SHFL_IDX_F32 : SHFL<Float32Regs, "idx", int_nvvm_shfl_idx_f32>;
-
-multiclass SHFL_SYNC<NVPTXRegClass regclass, string mode, Intrinsic IntOp> {
- // Threadmask and the last two parameters to shfl.sync can be regs or imms.
- // ptxas is smart enough to inline constant registers, so strictly speaking we
- // don't need to handle immediates here. But it's easy enough, and it makes
- // our ptx more readable.
- def rrr : NVPTXInst<
- (outs regclass:$dst),
- (ins Int32Regs:$threadmask, regclass:$src, Int32Regs:$offset, Int32Regs:$mask),
- !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
- [(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src,
- Int32Regs:$offset, Int32Regs:$mask))]>;
-
- def rri : NVPTXInst<
- (outs regclass:$dst),
- (ins Int32Regs:$threadmask, regclass:$src, Int32Regs:$offset, i32imm:$mask),
- !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
- [(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src,
- Int32Regs:$offset, imm:$mask))]>;
-
- def rir : NVPTXInst<
- (outs regclass:$dst),
- (ins Int32Regs:$threadmask, regclass:$src, i32imm:$offset, Int32Regs:$mask),
- !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
- [(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src,
- imm:$offset, Int32Regs:$mask))]>;
-
- def rii : NVPTXInst<
- (outs regclass:$dst),
- (ins Int32Regs:$threadmask, regclass:$src, i32imm:$offset, i32imm:$mask),
- !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
- [(set regclass:$dst, (IntOp Int32Regs:$threadmask, regclass:$src,
- imm:$offset, imm:$mask))]>;
-
- def irr : NVPTXInst<
- (outs regclass:$dst),
- (ins i32imm:$threadmask, regclass:$src, Int32Regs:$offset, Int32Regs:$mask),
- !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
- [(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src,
- Int32Regs:$offset, Int32Regs:$mask))]>;
-
- def iri : NVPTXInst<
- (outs regclass:$dst),
- (ins i32imm:$threadmask, regclass:$src, Int32Regs:$offset, i32imm:$mask),
- !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
- [(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src,
- Int32Regs:$offset, imm:$mask))]>;
-
- def iir : NVPTXInst<
- (outs regclass:$dst),
- (ins i32imm:$threadmask, regclass:$src, i32imm:$offset, Int32Regs:$mask),
- !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
- [(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src,
- imm:$offset, Int32Regs:$mask))]>;
-
- def iii : NVPTXInst<
- (outs regclass:$dst),
- (ins i32imm:$threadmask, regclass:$src, i32imm:$offset, i32imm:$mask),
- !strconcat("shfl.sync.", mode, ".b32 $dst, $src, $offset, $mask, $threadmask;"),
- [(set regclass:$dst, (IntOp imm:$threadmask, regclass:$src,
- imm:$offset, imm:$mask))]>;
+foreach sync = [0, 1] in {
+ foreach mode = ["up", "down", "bfly", "idx"] in {
+ foreach regclass = ["i32", "f32"] in {
+ foreach return_pred = [0, 1] in {
+ foreach offset_imm = [0, 1] in {
+ foreach mask_imm = [0, 1] in {
+ foreach threadmask_imm = THREADMASK_INFO<sync>.ret in {
+ def : SHFL_INSTR<sync, mode, regclass, return_pred,
+ offset_imm, mask_imm, threadmask_imm>,
+ Requires<!if(sync, [hasSM30], [hasSM30, hasSHFL])>;
+ }
+ }
+ }
+ }
+ }
+ }
}
-// On sm_70 these don't have to be convergent, so we may eventually want to
-// implement non-convergent variant of this intrinsic.
-defm INT_SHFL_SYNC_DOWN_I32 : SHFL_SYNC<Int32Regs, "down", int_nvvm_shfl_sync_down_i32>;
-defm INT_SHFL_SYNC_DOWN_F32 : SHFL_SYNC<Float32Regs, "down", int_nvvm_shfl_sync_down_f32>;
-defm INT_SHFL_SYNC_UP_I32 : SHFL_SYNC<Int32Regs, "up", int_nvvm_shfl_sync_up_i32>;
-defm INT_SHFL_SYNC_UP_F32 : SHFL_SYNC<Float32Regs, "up", int_nvvm_shfl_sync_up_f32>;
-defm INT_SHFL_SYNC_BFLY_I32 : SHFL_SYNC<Int32Regs, "bfly", int_nvvm_shfl_sync_bfly_i32>;
-defm INT_SHFL_SYNC_BFLY_F32 : SHFL_SYNC<Float32Regs, "bfly", int_nvvm_shfl_sync_bfly_f32>;
-defm INT_SHFL_SYNC_IDX_I32 : SHFL_SYNC<Int32Regs, "idx", int_nvvm_shfl_sync_idx_i32>;
-defm INT_SHFL_SYNC_IDX_F32 : SHFL_SYNC<Float32Regs, "idx", int_nvvm_shfl_sync_idx_f32>;
-
-
// vote.{all,any,uni,ballot}
multiclass VOTE<NVPTXRegClass regclass, string mode, Intrinsic IntOp> {
def : NVPTXInst<(outs regclass:$dest), (ins Int1Regs:$pred),
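
The new SHFL_INSTR class above replaces the hand-written SHFL and SHFL_SYNC multiclasses by deriving the operand lists, intrinsic name, and assembly string from its template bits. A hedged C++ sketch that mirrors only the AsmString concatenation, handy for checking what a given flag combination prints; the function name is illustrative, not part of the patch:

    #include <iostream>
    #include <string>

    // Mirrors the AsmString concatenation in SHFL_INSTR (illustration only).
    std::string shflAsmString(bool sync, const std::string &mode, bool returnPred) {
      std::string s = "shfl.";
      s += sync ? "sync." : "";
      s += mode + ".b32\t$dst";
      s += returnPred ? "|$pred" : "";
      s += ", $src, $offset, $mask";
      s += sync ? ", $threadmask" : "";
      s += ";";
      return s;
    }

    int main() {
      // Prints: shfl.sync.down.b32  $dst|$pred, $src, $offset, $mask, $threadmask;
      std::cout << shflAsmString(true, "down", true) << "\n";
      // Prints: shfl.bfly.b32       $dst, $src, $offset, $mask;
      std::cout << shflAsmString(false, "bfly", false) << "\n";
    }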
diff --git a/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp b/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
index 0743a2986718..83039241a7c7 100644
--- a/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
+++ b/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
@@ -103,7 +103,7 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) {
// Do the transformation of an aggr load/copy/set to a loop
//
for (LoadInst *LI : AggrLoads) {
- StoreInst *SI = dyn_cast<StoreInst>(*LI->user_begin());
+ auto *SI = cast<StoreInst>(*LI->user_begin());
Value *SrcAddr = LI->getOperand(0);
Value *DstAddr = SI->getOperand(1);
unsigned NumLoads = DL.getTypeStoreSize(LI->getType());
diff --git a/lib/Target/NVPTX/NVPTXLowerAlloca.cpp b/lib/Target/NVPTX/NVPTXLowerAlloca.cpp
index 76fb9f3fa692..945b7286b03c 100644
--- a/lib/Target/NVPTX/NVPTXLowerAlloca.cpp
+++ b/lib/Target/NVPTX/NVPTXLowerAlloca.cpp
@@ -41,12 +41,12 @@ void initializeNVPTXLowerAllocaPass(PassRegistry &);
}
namespace {
-class NVPTXLowerAlloca : public BasicBlockPass {
- bool runOnBasicBlock(BasicBlock &BB) override;
+class NVPTXLowerAlloca : public FunctionPass {
+ bool runOnFunction(Function &F) override;
public:
static char ID; // Pass identification, replacement for typeid
- NVPTXLowerAlloca() : BasicBlockPass(ID) {}
+ NVPTXLowerAlloca() : FunctionPass(ID) {}
StringRef getPassName() const override {
return "convert address space of alloca'ed memory to local";
}
@@ -61,58 +61,61 @@ INITIALIZE_PASS(NVPTXLowerAlloca, "nvptx-lower-alloca",
// =============================================================================
// Main function for this pass.
// =============================================================================
-bool NVPTXLowerAlloca::runOnBasicBlock(BasicBlock &BB) {
- if (skipBasicBlock(BB))
+bool NVPTXLowerAlloca::runOnFunction(Function &F) {
+ if (skipFunction(F))
return false;
bool Changed = false;
- for (auto &I : BB) {
- if (auto allocaInst = dyn_cast<AllocaInst>(&I)) {
- Changed = true;
- auto PTy = dyn_cast<PointerType>(allocaInst->getType());
- auto ETy = PTy->getElementType();
- auto LocalAddrTy = PointerType::get(ETy, ADDRESS_SPACE_LOCAL);
- auto NewASCToLocal = new AddrSpaceCastInst(allocaInst, LocalAddrTy, "");
- auto GenericAddrTy = PointerType::get(ETy, ADDRESS_SPACE_GENERIC);
- auto NewASCToGeneric = new AddrSpaceCastInst(NewASCToLocal,
- GenericAddrTy, "");
- NewASCToLocal->insertAfter(allocaInst);
- NewASCToGeneric->insertAfter(NewASCToLocal);
- for (Value::use_iterator UI = allocaInst->use_begin(),
- UE = allocaInst->use_end();
- UI != UE; ) {
- // Check Load, Store, GEP, and BitCast Uses on alloca and make them
- // use the converted generic address, in order to expose non-generic
- // addrspacecast to NVPTXInferAddressSpaces. For other types
- // of instructions this is unnecessary and may introduce redundant
- // address cast.
- const auto &AllocaUse = *UI++;
- auto LI = dyn_cast<LoadInst>(AllocaUse.getUser());
- if (LI && LI->getPointerOperand() == allocaInst && !LI->isVolatile()) {
- LI->setOperand(LI->getPointerOperandIndex(), NewASCToGeneric);
- continue;
- }
- auto SI = dyn_cast<StoreInst>(AllocaUse.getUser());
- if (SI && SI->getPointerOperand() == allocaInst && !SI->isVolatile()) {
- SI->setOperand(SI->getPointerOperandIndex(), NewASCToGeneric);
- continue;
- }
- auto GI = dyn_cast<GetElementPtrInst>(AllocaUse.getUser());
- if (GI && GI->getPointerOperand() == allocaInst) {
- GI->setOperand(GI->getPointerOperandIndex(), NewASCToGeneric);
- continue;
- }
- auto BI = dyn_cast<BitCastInst>(AllocaUse.getUser());
- if (BI && BI->getOperand(0) == allocaInst) {
- BI->setOperand(0, NewASCToGeneric);
- continue;
+ for (auto &BB : F)
+ for (auto &I : BB) {
+ if (auto allocaInst = dyn_cast<AllocaInst>(&I)) {
+ Changed = true;
+ auto PTy = dyn_cast<PointerType>(allocaInst->getType());
+ auto ETy = PTy->getElementType();
+ auto LocalAddrTy = PointerType::get(ETy, ADDRESS_SPACE_LOCAL);
+ auto NewASCToLocal = new AddrSpaceCastInst(allocaInst, LocalAddrTy, "");
+ auto GenericAddrTy = PointerType::get(ETy, ADDRESS_SPACE_GENERIC);
+ auto NewASCToGeneric =
+ new AddrSpaceCastInst(NewASCToLocal, GenericAddrTy, "");
+ NewASCToLocal->insertAfter(allocaInst);
+ NewASCToGeneric->insertAfter(NewASCToLocal);
+ for (Value::use_iterator UI = allocaInst->use_begin(),
+ UE = allocaInst->use_end();
+ UI != UE;) {
+ // Check Load, Store, GEP, and BitCast Uses on alloca and make them
+ // use the converted generic address, in order to expose non-generic
+ // addrspacecast to NVPTXInferAddressSpaces. For other types
+ // of instructions this is unnecessary and may introduce redundant
+ // address cast.
+ const auto &AllocaUse = *UI++;
+ auto LI = dyn_cast<LoadInst>(AllocaUse.getUser());
+ if (LI && LI->getPointerOperand() == allocaInst &&
+ !LI->isVolatile()) {
+ LI->setOperand(LI->getPointerOperandIndex(), NewASCToGeneric);
+ continue;
+ }
+ auto SI = dyn_cast<StoreInst>(AllocaUse.getUser());
+ if (SI && SI->getPointerOperand() == allocaInst &&
+ !SI->isVolatile()) {
+ SI->setOperand(SI->getPointerOperandIndex(), NewASCToGeneric);
+ continue;
+ }
+ auto GI = dyn_cast<GetElementPtrInst>(AllocaUse.getUser());
+ if (GI && GI->getPointerOperand() == allocaInst) {
+ GI->setOperand(GI->getPointerOperandIndex(), NewASCToGeneric);
+ continue;
+ }
+ auto BI = dyn_cast<BitCastInst>(AllocaUse.getUser());
+ if (BI && BI->getOperand(0) == allocaInst) {
+ BI->setOperand(0, NewASCToGeneric);
+ continue;
+ }
}
}
}
- }
return Changed;
}
-BasicBlockPass *llvm::createNVPTXLowerAllocaPass() {
+FunctionPass *llvm::createNVPTXLowerAllocaPass() {
return new NVPTXLowerAlloca();
}
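
The NVPTXLowerAlloca conversion above follows the usual recipe for retiring a BasicBlockPass: the pass becomes a legacy FunctionPass, checks skipFunction() instead of skipBasicBlock(), and wraps the old per-block body in a loop over the function's blocks. A generic sketch of that shape with placeholder names, not the actual NVPTX pass:

    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/Function.h"
    #include "llvm/Pass.h"

    using namespace llvm;

    namespace {
    // Placeholder pass illustrating the BasicBlockPass -> FunctionPass migration.
    class ExampleLowering : public FunctionPass {
    public:
      static char ID;
      ExampleLowering() : FunctionPass(ID) {}

      bool runOnFunction(Function &F) override {
        if (skipFunction(F))       // replaces the old skipBasicBlock(BB) check
          return false;
        bool Changed = false;
        for (BasicBlock &BB : F)   // the old runOnBasicBlock body moves in here
          for (Instruction &I : BB)
            (void)I;               // per-instruction rewriting would go here
        return Changed;
      }
    };
    } // end anonymous namespace

    char ExampleLowering::ID = 0;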
diff --git a/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index c5e02e34e25e..c3c5f6fbcba7 100644
--- a/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -164,7 +164,7 @@ void NVPTXLowerArgs::handleByValParam(Argument *Arg) {
// Set the alignment to alignment of the byval parameter. This is because,
// later load/stores assume that alignment, and we are going to replace
// the use of the byval parameter with this alloca instruction.
- AllocA->setAlignment(Func->getParamAlignment(Arg->getArgNo()));
+ AllocA->setAlignment(MaybeAlign(Func->getParamAlignment(Arg->getArgNo())));
Arg->replaceAllUsesWith(AllocA);
Value *ArgInParam = new AddrSpaceCastInst(
diff --git a/lib/Target/NVPTX/NVPTXPeephole.cpp b/lib/Target/NVPTX/NVPTXPeephole.cpp
index 629757db8707..5e6411c61eab 100644
--- a/lib/Target/NVPTX/NVPTXPeephole.cpp
+++ b/lib/Target/NVPTX/NVPTXPeephole.cpp
@@ -81,7 +81,7 @@ static bool isCVTAToLocalCombinationCandidate(MachineInstr &Root) {
auto &Op = Root.getOperand(1);
const auto &MRI = MF.getRegInfo();
MachineInstr *GenericAddrDef = nullptr;
- if (Op.isReg() && TargetRegisterInfo::isVirtualRegister(Op.getReg())) {
+ if (Op.isReg() && Register::isVirtualRegister(Op.getReg())) {
GenericAddrDef = MRI.getUniqueVRegDef(Op.getReg());
}
diff --git a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
index 4c5a9adf1f65..a7127b0e9a99 100644
--- a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
+++ b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
@@ -178,7 +178,7 @@ NVPTXPrologEpilogPass::calculateFrameObjectOffsets(MachineFunction &Fn) {
// frame index registers. Functions which don't want/need this optimization
// will continue to use the existing code path.
if (MFI.getUseLocalStackAllocationBlock()) {
- unsigned Align = MFI.getLocalFrameMaxAlign();
+ unsigned Align = MFI.getLocalFrameMaxAlign().value();
// Adjust to alignment boundary.
Offset = (Offset + Align - 1) / Align * Align;
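
Here getLocalFrameMaxAlign() now returns an Align, so the prologue/epilogue code unwraps it with .value() before the manual round-up. The expression above is the standard round-up-to-multiple idiom; a small sketch of the same computation, assuming llvm::alignTo from llvm/Support/MathExtras.h, which states it more directly:

    #include "llvm/Support/MathExtras.h"
    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t Offset = 13, Align = 8;
      // Manual idiom from the hunk above: round Offset up to a multiple of Align.
      uint64_t Manual = (Offset + Align - 1) / Align * Align;
      // Equivalent library call.
      uint64_t Rounded = llvm::alignTo(Offset, Align);
      assert(Manual == Rounded && Rounded == 16);
      return 0;
    }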
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 11b3fe2fa3d3..f58fb5717773 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -116,7 +116,7 @@ NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT,
CPU, FS, Options, Reloc::PIC_,
getEffectiveCodeModel(CM, CodeModel::Small), OL),
is64bit(is64bit), UseShortPointers(UseShortPointersOpt),
- TLOF(llvm::make_unique<NVPTXTargetObjectFile>()),
+ TLOF(std::make_unique<NVPTXTargetObjectFile>()),
Subtarget(TT, CPU, FS, *this) {
if (TT.getOS() == Triple::NVCL)
drvInterface = NVPTX::NVCL;
diff --git a/lib/Target/NVPTX/NVPTXUtilities.cpp b/lib/Target/NVPTX/NVPTXUtilities.cpp
index 665eb1383253..43c2e9920403 100644
--- a/lib/Target/NVPTX/NVPTXUtilities.cpp
+++ b/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -19,10 +19,11 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/Support/ManagedStatic.h"
-#include "llvm/Support/MutexGuard.h"
+#include "llvm/Support/Mutex.h"
#include <algorithm>
#include <cstring>
#include <map>
+#include <mutex>
#include <string>
#include <vector>
@@ -38,12 +39,12 @@ static ManagedStatic<per_module_annot_t> annotationCache;
static sys::Mutex Lock;
void clearAnnotationCache(const Module *Mod) {
- MutexGuard Guard(Lock);
+ std::lock_guard<sys::Mutex> Guard(Lock);
annotationCache->erase(Mod);
}
static void cacheAnnotationFromMD(const MDNode *md, key_val_pair_t &retval) {
- MutexGuard Guard(Lock);
+ std::lock_guard<sys::Mutex> Guard(Lock);
assert(md && "Invalid mdnode for annotation");
assert((md->getNumOperands() % 2) == 1 && "Invalid number of operands");
// start index = 1, to skip the global variable key
@@ -69,7 +70,7 @@ static void cacheAnnotationFromMD(const MDNode *md, key_val_pair_t &retval) {
}
static void cacheAnnotationFromMD(const Module *m, const GlobalValue *gv) {
- MutexGuard Guard(Lock);
+ std::lock_guard<sys::Mutex> Guard(Lock);
NamedMDNode *NMD = m->getNamedMetadata("nvvm.annotations");
if (!NMD)
return;
@@ -103,7 +104,7 @@ static void cacheAnnotationFromMD(const Module *m, const GlobalValue *gv) {
bool findOneNVVMAnnotation(const GlobalValue *gv, const std::string &prop,
unsigned &retval) {
- MutexGuard Guard(Lock);
+ std::lock_guard<sys::Mutex> Guard(Lock);
const Module *m = gv->getParent();
if ((*annotationCache).find(m) == (*annotationCache).end())
cacheAnnotationFromMD(m, gv);
@@ -117,7 +118,7 @@ bool findOneNVVMAnnotation(const GlobalValue *gv, const std::string &prop,
bool findAllNVVMAnnotation(const GlobalValue *gv, const std::string &prop,
std::vector<unsigned> &retval) {
- MutexGuard Guard(Lock);
+ std::lock_guard<sys::Mutex> Guard(Lock);
const Module *m = gv->getParent();
if ((*annotationCache).find(m) == (*annotationCache).end())
cacheAnnotationFromMD(m, gv);
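
The NVPTXUtilities.cpp hunks replace the removed llvm::MutexGuard RAII helper with plain std::lock_guard over the same sys::Mutex, which satisfies the standard lockable requirements. A minimal sketch of the pattern, using a placeholder cache rather than the real annotation map:

    #include "llvm/Support/Mutex.h"
    #include <map>
    #include <mutex>
    #include <string>

    using namespace llvm;

    static sys::Mutex Lock;
    static std::map<std::string, unsigned> Cache; // stand-in for the annotation cache

    // Placeholder accessor: std::lock_guard is a drop-in replacement for the old
    // MutexGuard helper because sys::Mutex exposes lock()/unlock().
    unsigned lookupOrInsert(const std::string &Key, unsigned Default) {
      std::lock_guard<sys::Mutex> Guard(Lock);
      auto It = Cache.find(Key);
      if (It == Cache.end())
        It = Cache.emplace(Key, Default).first;
      return It->second;
    }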