1 files changed, 188 insertions, 40 deletions
diff --git a/lib/Driver/ToolChains/Cuda.cpp b/lib/Driver/ToolChains/Cuda.cpp
index bc4820797b2f4..d17c4c39532ad 100644
--- a/lib/Driver/ToolChains/Cuda.cpp
+++ b/lib/Driver/ToolChains/Cuda.cpp
@@ -8,18 +8,21 @@
 //===----------------------------------------------------------------------===//
 
 #include "Cuda.h"
-#include "InputInfo.h"
 #include "CommonArgs.h"
+#include "InputInfo.h"
 #include "clang/Basic/Cuda.h"
-#include "clang/Config/config.h"
 #include "clang/Basic/VirtualFileSystem.h"
-#include "clang/Driver/Distro.h"
+#include "clang/Config/config.h"
 #include "clang/Driver/Compilation.h"
+#include "clang/Driver/Distro.h"
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/DriverDiagnostic.h"
 #include "clang/Driver/Options.h"
 #include "llvm/Option/ArgList.h"
+#include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
+#include "llvm/Support/Process.h"
+#include "llvm/Support/Program.h"
 #include <system_error>
 
 using namespace clang::driver;
@@ -52,6 +55,10 @@ static CudaVersion ParseCudaVersionFile(llvm::StringRef V) {
     return CudaVersion::CUDA_80;
   if (Major == 9 && Minor == 0)
     return CudaVersion::CUDA_90;
+  if (Major == 9 && Minor == 1)
+    return CudaVersion::CUDA_91;
+  if (Major == 9 && Minor == 2)
+    return CudaVersion::CUDA_92;
   return CudaVersion::UNKNOWN;
 }
 
@@ -59,42 +66,75 @@ CudaInstallationDetector::CudaInstallationDetector(
     const Driver &D, const llvm::Triple &HostTriple,
     const llvm::opt::ArgList &Args)
     : D(D) {
-  SmallVector<std::string, 4> CudaPathCandidates;
+  struct Candidate {
+    std::string Path;
+    bool StrictChecking;
+
+    Candidate(std::string Path, bool StrictChecking = false)
+        : Path(Path), StrictChecking(StrictChecking) {}
+  };
+  SmallVector<Candidate, 4> Candidates;
 
   // In decreasing order so we prefer newer versions to older versions.
   std::initializer_list<const char *> Versions = {"8.0", "7.5", "7.0"};
 
   if (Args.hasArg(clang::driver::options::OPT_cuda_path_EQ)) {
-    CudaPathCandidates.push_back(
-        Args.getLastArgValue(clang::driver::options::OPT_cuda_path_EQ));
+    Candidates.emplace_back(
+        Args.getLastArgValue(clang::driver::options::OPT_cuda_path_EQ).str());
   } else if (HostTriple.isOSWindows()) {
     for (const char *Ver : Versions)
-      CudaPathCandidates.push_back(
+      Candidates.emplace_back(
           D.SysRoot + "/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v" +
           Ver);
   } else {
-    CudaPathCandidates.push_back(D.SysRoot + "/usr/local/cuda");
+    if (!Args.hasArg(clang::driver::options::OPT_cuda_path_ignore_env)) {
+      // Try to find ptxas binary. If the executable is located in a directory
+      // called 'bin/', its parent directory might be a good guess for a valid
+      // CUDA installation.
+      // However, some distributions might installs 'ptxas' to /usr/bin. In that
+      // case the candidate would be '/usr' which passes the following checks
+      // because '/usr/include' exists as well. To avoid this case, we always
+      // check for the directory potentially containing files for libdevice,
+      // even if the user passes -nocudalib.
+      if (llvm::ErrorOr<std::string> ptxas =
+              llvm::sys::findProgramByName("ptxas")) {
+        SmallString<256> ptxasAbsolutePath;
+        llvm::sys::fs::real_path(*ptxas, ptxasAbsolutePath);
+
+        StringRef ptxasDir = llvm::sys::path::parent_path(ptxasAbsolutePath);
+        if (llvm::sys::path::filename(ptxasDir) == "bin")
+          Candidates.emplace_back(llvm::sys::path::parent_path(ptxasDir),
+                                  /*StrictChecking=*/true);
+      }
+    }
+
+    Candidates.emplace_back(D.SysRoot + "/usr/local/cuda");
     for (const char *Ver : Versions)
-      CudaPathCandidates.push_back(D.SysRoot + "/usr/local/cuda-" + Ver);
+      Candidates.emplace_back(D.SysRoot + "/usr/local/cuda-" + Ver);
 
     if (Distro(D.getVFS()).IsDebian())
       // Special case for Debian to have nvidia-cuda-toolkit work
       // out of the box. More info on http://bugs.debian.org/882505
-      CudaPathCandidates.push_back(D.SysRoot + "/usr/lib/cuda");
+      Candidates.emplace_back(D.SysRoot + "/usr/lib/cuda");
   }
 
-  for (const auto &CudaPath : CudaPathCandidates) {
-    if (CudaPath.empty() || !D.getVFS().exists(CudaPath))
+  bool NoCudaLib = Args.hasArg(options::OPT_nocudalib);
+
+  for (const auto &Candidate : Candidates) {
+    InstallPath = Candidate.Path;
+    if (InstallPath.empty() || !D.getVFS().exists(InstallPath))
       continue;
 
-    InstallPath = CudaPath;
-    BinPath = CudaPath + "/bin";
+    BinPath = InstallPath + "/bin";
     IncludePath = InstallPath + "/include";
     LibDevicePath = InstallPath + "/nvvm/libdevice";
 
     auto &FS = D.getVFS();
     if (!(FS.exists(IncludePath) && FS.exists(BinPath)))
       continue;
+    bool CheckLibDevice = (!NoCudaLib || Candidate.StrictChecking);
+    if (CheckLibDevice && !FS.exists(LibDevicePath))
+      continue;
 
     // On Linux, we have both lib and lib64 directories, and we need to choose
     // based on our triple.  On MacOS, we have only a lib directory.
@@ -119,14 +159,18 @@ CudaInstallationDetector::CudaInstallationDetector(
       Version = ParseCudaVersionFile((*VersionFile)->getBuffer());
     }
 
-    if (Version == CudaVersion::CUDA_90) {
-      // CUDA-9 uses single libdevice file for all GPU variants.
+    if (Version >= CudaVersion::CUDA_90) {
+      // CUDA-9+ uses single libdevice file for all GPU variants.
       std::string FilePath = LibDevicePath + "/libdevice.10.bc";
       if (FS.exists(FilePath)) {
-        for (const char *GpuArch :
-             {"sm_20", "sm_30", "sm_32", "sm_35", "sm_50", "sm_52", "sm_53",
-              "sm_60", "sm_61", "sm_62", "sm_70"})
-          LibDeviceMap[GpuArch] = FilePath;
+        for (const char *GpuArchName :
+             {"sm_30", "sm_32", "sm_35", "sm_37", "sm_50", "sm_52", "sm_53",
+              "sm_60", "sm_61", "sm_62", "sm_70", "sm_72"}) {
+          const CudaArch GpuArch = StringToCudaArch(GpuArchName);
+          if (Version >= MinVersionForCudaArch(GpuArch) &&
+              Version <= MaxVersionForCudaArch(GpuArch))
+            LibDeviceMap[GpuArchName] = FilePath;
+        }
       }
     } else {
       std::error_code EC;
@@ -142,7 +186,7 @@ CudaInstallationDetector::CudaInstallationDetector(
         StringRef GpuArch = FileName.slice(
             LibDeviceName.size(), FileName.find('.', LibDeviceName.size()));
         LibDeviceMap[GpuArch] = FilePath.str();
-        // Insert map entries for specifc devices with this compute
+        // Insert map entries for specific devices with this compute
         // capability. NVCC's choice of the libdevice library version is
         // rather peculiar and depends on the CUDA version.
         if (GpuArch == "compute_20") {
@@ -174,7 +218,7 @@ CudaInstallationDetector::CudaInstallationDetector(
 
     // Check that we have found at least one libdevice that we can link in if
     // -nocudalib hasn't been specified.
-    if (LibDeviceMap.empty() && !Args.hasArg(options::OPT_nocudalib))
+    if (LibDeviceMap.empty() && !NoCudaLib)
       continue;
 
     IsValid = true;
@@ -231,6 +275,35 @@ void CudaInstallationDetector::print(raw_ostream &OS) const {
        << CudaVersionToString(Version) << "\n";
 }
 
+namespace {
+  /// Debug info kind.
+enum DebugInfoKind {
+  NoDebug,       /// No debug info.
+  LineTableOnly, /// Line tables only.
+  FullDebug      /// Full debug info.
+};
+} // anonymous namespace
+
+static DebugInfoKind mustEmitDebugInfo(const ArgList &Args) {
+  Arg *A = Args.getLastArg(options::OPT_O_Group);
+  if (Args.hasFlag(options::OPT_cuda_noopt_device_debug,
+                   options::OPT_no_cuda_noopt_device_debug,
+                   !A || A->getOption().matches(options::OPT_O0))) {
+    if (const Arg *A = Args.getLastArg(options::OPT_g_Group)) {
+      const Option &Opt = A->getOption();
+      if (Opt.matches(options::OPT_gN_Group)) {
+        if (Opt.matches(options::OPT_g0) || Opt.matches(options::OPT_ggdb0))
+          return NoDebug;
+        if (Opt.matches(options::OPT_gline_tables_only) ||
+            Opt.matches(options::OPT_ggdb1))
+          return LineTableOnly;
+      }
+      return FullDebug;
+    }
+  }
+  return NoDebug;
+}
+
 void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
                                     const InputInfo &Output,
                                     const InputInfoList &Inputs,
@@ -262,8 +335,8 @@ void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
 
   ArgStringList CmdArgs;
   CmdArgs.push_back(TC.getTriple().isArch64Bit() ? "-m64" : "-m32");
-  if (Args.hasFlag(options::OPT_cuda_noopt_device_debug,
-                   options::OPT_no_cuda_noopt_device_debug, false)) {
+  DebugInfoKind DIKind = mustEmitDebugInfo(Args);
+  if (DIKind == FullDebug) {
     // ptxas does not accept -g option if optimization is enabled, so
     // we ignore the compiler's -O* options if we want debug info.
     CmdArgs.push_back("-g");
@@ -299,6 +372,8 @@ void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
     // to no optimizations, but ptxas's default is -O3.
     CmdArgs.push_back("-O0");
   }
+  if (DIKind == LineTableOnly)
+    CmdArgs.push_back("-lineinfo");
 
   // Pass -v to ptxas if it was passed to the driver.
   if (Args.hasArg(options::OPT_v))
@@ -314,11 +389,17 @@ void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
   for (const auto& A : Args.getAllArgValues(options::OPT_Xcuda_ptxas))
     CmdArgs.push_back(Args.MakeArgString(A));
 
-  // In OpenMP we need to generate relocatable code.
-  if (JA.isOffloading(Action::OFK_OpenMP) &&
-      Args.hasFlag(options::OPT_fopenmp_relocatable_target,
-                   options::OPT_fnoopenmp_relocatable_target,
-                   /*Default=*/ true))
+  bool Relocatable = false;
+  if (JA.isOffloading(Action::OFK_OpenMP))
+    // In OpenMP we need to generate relocatable code.
+    Relocatable = Args.hasFlag(options::OPT_fopenmp_relocatable_target,
+                               options::OPT_fnoopenmp_relocatable_target,
+                               /*Default=*/true);
+  else if (JA.isOffloading(Action::OFK_Cuda))
+    Relocatable = Args.hasFlag(options::OPT_fcuda_rdc,
+                               options::OPT_fno_cuda_rdc, /*Default=*/false);
+
+  if (Relocatable)
     CmdArgs.push_back("-c");
 
   const char *Exec;
@@ -329,6 +410,22 @@ void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
   C.addCommand(llvm::make_unique<Command>(JA, *this, Exec, CmdArgs, Inputs));
 }
 
+static bool shouldIncludePTX(const ArgList &Args, const char *gpu_arch) {
+  bool includePTX = true;
+  for (Arg *A : Args) {
+    if (!(A->getOption().matches(options::OPT_cuda_include_ptx_EQ) ||
+          A->getOption().matches(options::OPT_no_cuda_include_ptx_EQ)))
+      continue;
+    A->claim();
+    const StringRef ArchStr = A->getValue();
+    if (ArchStr == "all" || ArchStr == gpu_arch) {
+      includePTX = A->getOption().matches(options::OPT_cuda_include_ptx_EQ);
+      continue;
+    }
+  }
+  return includePTX;
+}
+
 // All inputs to this linker must be from CudaDeviceActions, as we need to look
 // at the Inputs' Actions in order to figure out which GPU architecture they
 // correspond to.
@@ -346,6 +443,8 @@ void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA,
   CmdArgs.push_back(TC.getTriple().isArch64Bit() ? "-64" : "-32");
   CmdArgs.push_back(Args.MakeArgString("--create"));
   CmdArgs.push_back(Args.MakeArgString(Output.getFilename()));
+  if (mustEmitDebugInfo(Args) == FullDebug)
+    CmdArgs.push_back("-g");
 
   for (const auto& II : Inputs) {
     auto *A = II.getAction();
@@ -356,6 +455,9 @@ void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA,
            "Device action expected to have associated a GPU architecture!");
     CudaArch gpu_arch = StringToCudaArch(gpu_arch_str);
 
+    if (II.getType() == types::TY_PP_Asm &&
+        !shouldIncludePTX(Args, gpu_arch_str))
+      continue;
     // We need to pass an Arch of the form "sm_XX" for cubin files and
     // "compute_XX" for ptx.
     const char *Arch =
@@ -394,7 +496,7 @@ void NVPTX::OpenMPLinker::ConstructJob(Compilation &C, const JobAction &JA,
     CmdArgs.push_back(Output.getFilename());
   } else
     assert(Output.isNothing() && "Invalid output.");
-  if (Args.hasArg(options::OPT_g_Flag))
+  if (mustEmitDebugInfo(Args) == FullDebug)
     CmdArgs.push_back("-g");
 
   if (Args.hasArg(options::OPT_v))
@@ -499,6 +601,10 @@ void CudaToolChain::addClangTargetOptions(
     if (DriverArgs.hasFlag(options::OPT_fcuda_approx_transcendentals,
                            options::OPT_fno_cuda_approx_transcendentals, false))
       CC1Args.push_back("-fcuda-approx-transcendentals");
+
+    if (DriverArgs.hasFlag(options::OPT_fcuda_rdc, options::OPT_fno_cuda_rdc,
+                           false))
+      CC1Args.push_back("-fcuda-rdc");
   }
 
   if (DriverArgs.hasArg(options::OPT_nocudalib))
@@ -518,16 +624,58 @@ void CudaToolChain::addClangTargetOptions(
   CC1Args.push_back("-mlink-cuda-bitcode");
   CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile));
 
-  if (CudaInstallation.version() >= CudaVersion::CUDA_90) {
-    // CUDA-9 uses new instructions that are only available in PTX6.0
-    CC1Args.push_back("-target-feature");
-    CC1Args.push_back("+ptx60");
-  } else {
-    // Libdevice in CUDA-7.0 requires PTX version that's more recent
-    // than LLVM defaults to. Use PTX4.2 which is the PTX version that
-    // came with CUDA-7.0.
-    CC1Args.push_back("-target-feature");
-    CC1Args.push_back("+ptx42");
+  // Libdevice in CUDA-7.0 requires PTX version that's more recent than LLVM
+  // defaults to. Use PTX4.2 by default, which is the PTX version that came with
+  // CUDA-7.0.
+  const char *PtxFeature = "+ptx42";
+  if (CudaInstallation.version() >= CudaVersion::CUDA_91) {
+    // CUDA-9.1 uses new instructions that are only available in PTX6.1+
+    PtxFeature = "+ptx61";
+  } else if (CudaInstallation.version() >= CudaVersion::CUDA_90) {
+    // CUDA-9.0 uses new instructions that are only available in PTX6.0+
+    PtxFeature = "+ptx60";
+  }
+  CC1Args.append({"-target-feature", PtxFeature});
+  if (DriverArgs.hasFlag(options::OPT_fcuda_short_ptr,
+                         options::OPT_fno_cuda_short_ptr, false))
+    CC1Args.append({"-mllvm", "--nvptx-short-ptr"});
+
+  if (DeviceOffloadingKind == Action::OFK_OpenMP) {
+    SmallVector<StringRef, 8> LibraryPaths;
+    // Add path to lib and/or lib64 folders.
+    SmallString<256> DefaultLibPath =
+      llvm::sys::path::parent_path(getDriver().Dir);
+    llvm::sys::path::append(DefaultLibPath,
+        Twine("lib") + CLANG_LIBDIR_SUFFIX);
+    LibraryPaths.emplace_back(DefaultLibPath.c_str());
+
+    // Add user defined library paths from LIBRARY_PATH.
+    llvm::Optional<std::string> LibPath =
+        llvm::sys::Process::GetEnv("LIBRARY_PATH");
+    if (LibPath) {
+      SmallVector<StringRef, 8> Frags;
+      const char EnvPathSeparatorStr[] = {llvm::sys::EnvPathSeparator, '\0'};
+      llvm::SplitString(*LibPath, Frags, EnvPathSeparatorStr);
+      for (StringRef Path : Frags)
+        LibraryPaths.emplace_back(Path.trim());
+    }
+
+    std::string LibOmpTargetName =
+      "libomptarget-nvptx-" + GpuArch.str() + ".bc";
+    bool FoundBCLibrary = false;
+    for (StringRef LibraryPath : LibraryPaths) {
+      SmallString<128> LibOmpTargetFile(LibraryPath);
+      llvm::sys::path::append(LibOmpTargetFile, LibOmpTargetName);
+      if (llvm::sys::fs::exists(LibOmpTargetFile)) {
+        CC1Args.push_back("-mlink-cuda-bitcode");
+        CC1Args.push_back(DriverArgs.MakeArgString(LibOmpTargetFile));
+        FoundBCLibrary = true;
+        break;
+      }
+    }
+    if (!FoundBCLibrary)
+      getDriver().Diag(diag::warn_drv_omp_offload_target_missingbcruntime)
+          << LibOmpTargetName;
   }
 }