diff options
author | Dimitry Andric <dim@FreeBSD.org> | 2017-01-06 20:13:35 +0000 |
---|---|---|
committer | Dimitry Andric <dim@FreeBSD.org> | 2017-01-06 20:13:35 +0000 |
commit | 6694ed095d6b27a2c92ec4fd63664fcd88a05749 (patch) | |
tree | 0633c29bd8350e306f3a24a30f3f6045efd35420 /lib/CodeGen | |
parent | d5dc75c5cf109efe52b1da32ec44a667389a0f0a (diff) | |
download | src-test2-6694ed095d6b27a2c92ec4fd63664fcd88a05749.tar.gz src-test2-6694ed095d6b27a2c92ec4fd63664fcd88a05749.zip |
Notes
Diffstat (limited to 'lib/CodeGen')
-rw-r--r-- | lib/CodeGen/BackendUtil.cpp | 30 | ||||
-rw-r--r-- | lib/CodeGen/CGBuiltin.cpp | 84 | ||||
-rw-r--r-- | lib/CodeGen/CGCall.cpp | 6 | ||||
-rw-r--r-- | lib/CodeGen/CGExpr.cpp | 11 | ||||
-rw-r--r-- | lib/CodeGen/CGOpenMPRuntime.cpp | 23 | ||||
-rw-r--r-- | lib/CodeGen/CGOpenMPRuntime.h | 36 | ||||
-rw-r--r-- | lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp | 270 | ||||
-rw-r--r-- | lib/CodeGen/CGOpenMPRuntimeNVPTX.h | 51 | ||||
-rw-r--r-- | lib/CodeGen/CodeGenAction.cpp | 16 | ||||
-rw-r--r-- | lib/CodeGen/CodeGenFunction.h | 2 | ||||
-rw-r--r-- | lib/CodeGen/ObjectFilePCHContainerOperations.cpp | 9 | ||||
-rw-r--r-- | lib/CodeGen/TargetInfo.cpp | 206 |
12 files changed, 505 insertions, 239 deletions
diff --git a/lib/CodeGen/BackendUtil.cpp b/lib/CodeGen/BackendUtil.cpp index 164e52d7de27..ed09f3a45566 100644 --- a/lib/CodeGen/BackendUtil.cpp +++ b/lib/CodeGen/BackendUtil.cpp @@ -14,6 +14,7 @@ #include "clang/Frontend/CodeGenOptions.h" #include "clang/Frontend/FrontendDiagnostic.h" #include "clang/Frontend/Utils.h" +#include "clang/Lex/HeaderSearchOptions.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" @@ -32,6 +33,7 @@ #include "llvm/IR/ModuleSummaryIndex.h" #include "llvm/IR/Verifier.h" #include "llvm/LTO/LTOBackend.h" +#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/SubtargetFeature.h" #include "llvm/Object/ModuleSummaryIndexObjectFile.h" #include "llvm/Passes/PassBuilder.h" @@ -61,6 +63,7 @@ namespace { class EmitAssemblyHelper { DiagnosticsEngine &Diags; + const HeaderSearchOptions &HSOpts; const CodeGenOptions &CodeGenOpts; const clang::TargetOptions &TargetOpts; const LangOptions &LangOpts; @@ -100,11 +103,14 @@ private: raw_pwrite_stream &OS); public: - EmitAssemblyHelper(DiagnosticsEngine &_Diags, const CodeGenOptions &CGOpts, + EmitAssemblyHelper(DiagnosticsEngine &_Diags, + const HeaderSearchOptions &HeaderSearchOpts, + const CodeGenOptions &CGOpts, const clang::TargetOptions &TOpts, const LangOptions &LOpts, Module *M) - : Diags(_Diags), CodeGenOpts(CGOpts), TargetOpts(TOpts), LangOpts(LOpts), - TheModule(M), CodeGenerationTime("codegen", "Code Generation Time") {} + : Diags(_Diags), HSOpts(HeaderSearchOpts), CodeGenOpts(CGOpts), + TargetOpts(TOpts), LangOpts(LOpts), TheModule(M), + CodeGenerationTime("codegen", "Code Generation Time") {} ~EmitAssemblyHelper() { if (CodeGenOpts.DisableFree) @@ -584,12 +590,18 @@ void EmitAssemblyHelper::CreateTargetMachine(bool MustCreateTM) { Options.MCOptions.MCNoExecStack = CodeGenOpts.NoExecStack; Options.MCOptions.MCIncrementalLinkerCompatible = CodeGenOpts.IncrementalLinkerCompatible; - Options.MCOptions.MCPIECopyRelocations = - 
CodeGenOpts.PIECopyRelocations; + Options.MCOptions.MCPIECopyRelocations = CodeGenOpts.PIECopyRelocations; Options.MCOptions.MCFatalWarnings = CodeGenOpts.FatalWarnings; Options.MCOptions.AsmVerbose = CodeGenOpts.AsmVerbose; Options.MCOptions.PreserveAsmComments = CodeGenOpts.PreserveAsmComments; Options.MCOptions.ABIName = TargetOpts.ABI; + for (const auto &Entry : HSOpts.UserEntries) + if (!Entry.IsFramework && + (Entry.Group == frontend::IncludeDirGroup::Quoted || + Entry.Group == frontend::IncludeDirGroup::Angled || + Entry.Group == frontend::IncludeDirGroup::System)) + Options.MCOptions.IASSearchPaths.push_back( + Entry.IgnoreSysRoot ? Entry.Path : HSOpts.Sysroot + Entry.Path); TM.reset(TheTarget->createTargetMachine(Triple, TargetOpts.CPU, FeaturesStr, Options, RM, CM, OptLevel)); @@ -929,17 +941,19 @@ static void runThinLTOBackend(const CodeGenOptions &CGOpts, Module *M, } void clang::EmitBackendOutput(DiagnosticsEngine &Diags, + const HeaderSearchOptions &HeaderOpts, const CodeGenOptions &CGOpts, const clang::TargetOptions &TOpts, - const LangOptions &LOpts, const llvm::DataLayout &TDesc, - Module *M, BackendAction Action, + const LangOptions &LOpts, + const llvm::DataLayout &TDesc, Module *M, + BackendAction Action, std::unique_ptr<raw_pwrite_stream> OS) { if (!CGOpts.ThinLTOIndexFile.empty()) { runThinLTOBackend(CGOpts, M, std::move(OS)); return; } - EmitAssemblyHelper AsmHelper(Diags, CGOpts, TOpts, LOpts, M); + EmitAssemblyHelper AsmHelper(Diags, HeaderOpts, CGOpts, TOpts, LOpts, M); if (CGOpts.ExperimentalNewPassManager) AsmHelper.EmitAssemblyWithNewPassManager(Action, std::move(OS)); diff --git a/lib/CodeGen/CGBuiltin.cpp b/lib/CodeGen/CGBuiltin.cpp index 43ca74761fbd..4d34b3e9222f 100644 --- a/lib/CodeGen/CGBuiltin.cpp +++ b/lib/CodeGen/CGBuiltin.cpp @@ -35,6 +35,11 @@ using namespace clang; using namespace CodeGen; using namespace llvm; +static +int64_t clamp(int64_t Value, int64_t Low, int64_t High) { + return std::min(High, std::max(Low, Value)); 
+} + /// getBuiltinLibFunction - Given a builtin id for a function like /// "__builtin_fabsf", return a Function* for "fabsf". llvm::Constant *CodeGenModule::getBuiltinLibFunction(const FunctionDecl *FD, @@ -8191,6 +8196,85 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID, llvm_unreachable("Unknown FMA operation"); return nullptr; // Suppress no-return warning } + + case PPC::BI__builtin_vsx_insertword: { + llvm::Function *F = CGM.getIntrinsic(Intrinsic::ppc_vsx_xxinsertw); + + // Third argument is a compile time constant int. It must be clamped to + // the range [0, 12]. + ConstantInt *ArgCI = dyn_cast<ConstantInt>(Ops[2]); + assert(ArgCI && + "Third arg to xxinsertw intrinsic must be constant integer"); + const int64_t MaxIndex = 12; + int64_t Index = clamp(ArgCI->getSExtValue(), 0, MaxIndex); + + // The builtin semantics don't exactly match the xxinsertw instructions + // semantics (which ppc_vsx_xxinsertw follows). The builtin extracts the + // word from the first argument, and inserts it in the second argument. The + // instruction extracts the word from its second input register and inserts + // it into its first input register, so swap the first and second arguments. + std::swap(Ops[0], Ops[1]); + + // Need to cast the second argument from a vector of unsigned int to a + // vector of long long. + Ops[1] = Builder.CreateBitCast(Ops[1], llvm::VectorType::get(Int64Ty, 2)); + + if (getTarget().isLittleEndian()) { + // Create a shuffle mask of (1, 0) + Constant *ShuffleElts[2] = { ConstantInt::get(Int32Ty, 1), + ConstantInt::get(Int32Ty, 0) + }; + Constant *ShuffleMask = llvm::ConstantVector::get(ShuffleElts); + + // Reverse the double words in the vector we will extract from. + Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(Int64Ty, 2)); + Ops[0] = Builder.CreateShuffleVector(Ops[0], Ops[0], ShuffleMask); + + // Reverse the index. + Index = MaxIndex - Index; + } + + // Intrinsic expects the first arg to be a vector of int. 
+ Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(Int32Ty, 4)); + Ops[2] = ConstantInt::getSigned(Int32Ty, Index); + return Builder.CreateCall(F, Ops); + } + + case PPC::BI__builtin_vsx_extractuword: { + llvm::Function *F = CGM.getIntrinsic(Intrinsic::ppc_vsx_xxextractuw); + + // Intrinsic expects the first argument to be a vector of doublewords. + Ops[0] = Builder.CreateBitCast(Ops[0], llvm::VectorType::get(Int64Ty, 2)); + + // The second argument is a compile time constant int that needs to + // be clamped to the range [0, 12]. + ConstantInt *ArgCI = dyn_cast<ConstantInt>(Ops[1]); + assert(ArgCI && + "Second Arg to xxextractuw intrinsic must be a constant integer!"); + const int64_t MaxIndex = 12; + int64_t Index = clamp(ArgCI->getSExtValue(), 0, MaxIndex); + + if (getTarget().isLittleEndian()) { + // Reverse the index. + Index = MaxIndex - Index; + Ops[1] = ConstantInt::getSigned(Int32Ty, Index); + + // Emit the call, then reverse the double words of the results vector. + Value *Call = Builder.CreateCall(F, Ops); + + // Create a shuffle mask of (1, 0) + Constant *ShuffleElts[2] = { ConstantInt::get(Int32Ty, 1), + ConstantInt::get(Int32Ty, 0) + }; + Constant *ShuffleMask = llvm::ConstantVector::get(ShuffleElts); + + Value *ShuffleCall = Builder.CreateShuffleVector(Call, Call, ShuffleMask); + return ShuffleCall; + } else { + Ops[1] = ConstantInt::getSigned(Int32Ty, Index); + return Builder.CreateCall(F, Ops); + } + } } } diff --git a/lib/CodeGen/CGCall.cpp b/lib/CodeGen/CGCall.cpp index 9b96a59aec38..c7c61e0c8ecb 100644 --- a/lib/CodeGen/CGCall.cpp +++ b/lib/CodeGen/CGCall.cpp @@ -393,15 +393,13 @@ CodeGenTypes::arrangeFunctionDeclaration(const FunctionDecl *FD) { // When declaring a function without a prototype, always use a // non-variadic type. 
- if (isa<FunctionNoProtoType>(FTy)) { - CanQual<FunctionNoProtoType> noProto = FTy.getAs<FunctionNoProtoType>(); + if (CanQual<FunctionNoProtoType> noProto = FTy.getAs<FunctionNoProtoType>()) { return arrangeLLVMFunctionInfo( noProto->getReturnType(), /*instanceMethod=*/false, /*chainCall=*/false, None, noProto->getExtInfo(), {},RequiredArgs::All); } - assert(isa<FunctionProtoType>(FTy)); - return arrangeFreeFunctionType(FTy.getAs<FunctionProtoType>(), FD); + return arrangeFreeFunctionType(FTy.castAs<FunctionProtoType>(), FD); } /// Arrange the argument and result information for the declaration or diff --git a/lib/CodeGen/CGExpr.cpp b/lib/CodeGen/CGExpr.cpp index 183201c78e36..e5e34a5f3ed6 100644 --- a/lib/CodeGen/CGExpr.cpp +++ b/lib/CodeGen/CGExpr.cpp @@ -604,12 +604,13 @@ void CodeGenFunction::EmitTypeCheck(TypeCheckKind TCK, SourceLocation Loc, } if (Checks.size() > 0) { + // Make sure we're not losing information. Alignment needs to be a power of + // 2 + assert(!AlignVal || (uint64_t)1 << llvm::Log2_64(AlignVal) == AlignVal); llvm::Constant *StaticData[] = { - EmitCheckSourceLocation(Loc), - EmitCheckTypeDescriptor(Ty), - llvm::ConstantInt::get(SizeTy, AlignVal), - llvm::ConstantInt::get(Int8Ty, TCK) - }; + EmitCheckSourceLocation(Loc), EmitCheckTypeDescriptor(Ty), + llvm::ConstantInt::get(Int8Ty, AlignVal ? 
llvm::Log2_64(AlignVal) : 1), + llvm::ConstantInt::get(Int8Ty, TCK)}; EmitCheck(Checks, SanitizerHandler::TypeMismatch, StaticData, Ptr); } diff --git a/lib/CodeGen/CGOpenMPRuntime.cpp b/lib/CodeGen/CGOpenMPRuntime.cpp index 0624d86b564a..27af344fae87 100644 --- a/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/lib/CodeGen/CGOpenMPRuntime.cpp @@ -2701,14 +2701,16 @@ void CGOpenMPRuntime::OffloadEntriesInfoManagerTy:: "only required for the device " "code generation."); OffloadEntriesTargetRegion[DeviceID][FileID][ParentName][LineNum] = - OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr); + OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr, + /*Flags=*/0); ++OffloadingEntriesNum; } void CGOpenMPRuntime::OffloadEntriesInfoManagerTy:: registerTargetRegionEntryInfo(unsigned DeviceID, unsigned FileID, StringRef ParentName, unsigned LineNum, - llvm::Constant *Addr, llvm::Constant *ID) { + llvm::Constant *Addr, llvm::Constant *ID, + int32_t Flags) { // If we are emitting code for a target, the entry is already initialized, // only has to be registered. 
if (CGM.getLangOpts().OpenMPIsDevice) { @@ -2719,9 +2721,10 @@ void CGOpenMPRuntime::OffloadEntriesInfoManagerTy:: assert(Entry.isValid() && "Entry not initialized!"); Entry.setAddress(Addr); Entry.setID(ID); + Entry.setFlags(Flags); return; } else { - OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum++, Addr, ID); + OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum++, Addr, ID, Flags); OffloadEntriesTargetRegion[DeviceID][FileID][ParentName][LineNum] = Entry; } } @@ -2888,7 +2891,8 @@ CGOpenMPRuntime::createOffloadingBinaryDescriptorRegistration() { } void CGOpenMPRuntime::createOffloadEntry(llvm::Constant *ID, - llvm::Constant *Addr, uint64_t Size) { + llvm::Constant *Addr, uint64_t Size, + int32_t Flags) { StringRef Name = Addr->getName(); auto *TgtOffloadEntryType = cast<llvm::StructType>( CGM.getTypes().ConvertTypeForMem(getTgtOffloadEntryQTy())); @@ -2918,6 +2922,8 @@ void CGOpenMPRuntime::createOffloadEntry(llvm::Constant *ID, EntryInit.add(AddrPtr); EntryInit.add(StrPtr); EntryInit.addInt(CGM.SizeTy, Size); + EntryInit.addInt(CGM.Int32Ty, Flags); + EntryInit.addInt(CGM.Int32Ty, 0); llvm::GlobalVariable *Entry = EntryInit.finishAndCreateGlobal(".omp_offloading.entry", Align, @@ -3090,6 +3096,8 @@ QualType CGOpenMPRuntime::getTgtOffloadEntryQTy() { // // (function or global) // char *name; // Name of the function or global. // size_t size; // Size of the entry info (0 if it a function). + // int32_t flags; // Flags associated with the entry, e.g. 'link'. + // int32_t reserved; // Reserved, to use by the runtime library. 
// }; if (TgtOffloadEntryQTy.isNull()) { ASTContext &C = CGM.getContext(); @@ -3098,6 +3106,10 @@ QualType CGOpenMPRuntime::getTgtOffloadEntryQTy() { addFieldToRecordDecl(C, RD, C.VoidPtrTy); addFieldToRecordDecl(C, RD, C.getPointerType(C.CharTy)); addFieldToRecordDecl(C, RD, C.getSizeType()); + addFieldToRecordDecl( + C, RD, C.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/true)); + addFieldToRecordDecl( + C, RD, C.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/true)); RD->completeDefinition(); TgtOffloadEntryQTy = C.getRecordType(RD); } @@ -4852,7 +4864,8 @@ void CGOpenMPRuntime::emitTargetOutlinedFunctionHelper( // Register the information for the entry associated with this target region. OffloadEntriesInfoManager.registerTargetRegionEntryInfo( - DeviceID, FileID, ParentName, Line, OutlinedFn, OutlinedFnID); + DeviceID, FileID, ParentName, Line, OutlinedFn, OutlinedFnID, + /*Flags=*/0); } /// discard all CompoundStmts intervening between two constructs diff --git a/lib/CodeGen/CGOpenMPRuntime.h b/lib/CodeGen/CGOpenMPRuntime.h index 9057e5ec4c14..9a784dff0ae8 100644 --- a/lib/CodeGen/CGOpenMPRuntime.h +++ b/lib/CodeGen/CGOpenMPRuntime.h @@ -110,9 +110,9 @@ protected: CodeGenModule &CGM; /// \brief Creates offloading entry for the provided entry ID \a ID, - /// address \a Addr and size \a Size. + /// address \a Addr, size \a Size, and flags \a Flags. virtual void createOffloadEntry(llvm::Constant *ID, llvm::Constant *Addr, - uint64_t Size); + uint64_t Size, int32_t Flags = 0); /// \brief Helper to emit outlined function for 'target' directive. /// \param D Directive to emit. @@ -245,10 +245,10 @@ private: unsigned OffloadingEntriesNum; public: - /// \brief Base class of the entries info. + /// Base class of the entries info. class OffloadEntryInfo { public: - /// \brief Kind of a given entry. Currently, only target regions are + /// Kind of a given entry. Currently, only target regions are /// supported. 
enum OffloadingEntryInfoKinds : unsigned { // Entry is a target region. @@ -257,17 +257,24 @@ private: OFFLOAD_ENTRY_INFO_INVALID = ~0u }; - OffloadEntryInfo() : Order(~0u), Kind(OFFLOAD_ENTRY_INFO_INVALID) {} - explicit OffloadEntryInfo(OffloadingEntryInfoKinds Kind, unsigned Order) - : Order(Order), Kind(Kind) {} + OffloadEntryInfo() + : Flags(0), Order(~0u), Kind(OFFLOAD_ENTRY_INFO_INVALID) {} + explicit OffloadEntryInfo(OffloadingEntryInfoKinds Kind, unsigned Order, + int32_t Flags) + : Flags(Flags), Order(Order), Kind(Kind) {} bool isValid() const { return Order != ~0u; } unsigned getOrder() const { return Order; } OffloadingEntryInfoKinds getKind() const { return Kind; } + int32_t getFlags() const { return Flags; } + void setFlags(int32_t NewFlags) { Flags = NewFlags; } static bool classof(const OffloadEntryInfo *Info) { return true; } - protected: - // \brief Order this entry was emitted. + private: + /// Flags associated with the device global. + int32_t Flags; + + /// Order this entry was emitted. unsigned Order; OffloadingEntryInfoKinds Kind; @@ -292,12 +299,13 @@ private: public: OffloadEntryInfoTargetRegion() - : OffloadEntryInfo(OFFLOAD_ENTRY_INFO_TARGET_REGION, ~0u), + : OffloadEntryInfo(OFFLOAD_ENTRY_INFO_TARGET_REGION, ~0u, + /*Flags=*/0), Addr(nullptr), ID(nullptr) {} explicit OffloadEntryInfoTargetRegion(unsigned Order, llvm::Constant *Addr, - llvm::Constant *ID) - : OffloadEntryInfo(OFFLOAD_ENTRY_INFO_TARGET_REGION, Order), + llvm::Constant *ID, int32_t Flags) + : OffloadEntryInfo(OFFLOAD_ENTRY_INFO_TARGET_REGION, Order, Flags), Addr(Addr), ID(ID) {} llvm::Constant *getAddress() const { return Addr; } @@ -321,8 +329,8 @@ private: /// \brief Register target region entry. 
void registerTargetRegionEntryInfo(unsigned DeviceID, unsigned FileID, StringRef ParentName, unsigned LineNum, - llvm::Constant *Addr, - llvm::Constant *ID); + llvm::Constant *Addr, llvm::Constant *ID, + int32_t Flags); /// \brief Return true if a target region entry with the provided /// information exists. bool hasTargetRegionEntryInfo(unsigned DeviceID, unsigned FileID, diff --git a/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp index fe0e2acdfdbf..bc1458b1c203 100644 --- a/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ b/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -22,14 +22,10 @@ using namespace CodeGen; namespace { enum OpenMPRTLFunctionNVPTX { - /// \brief Call to void __kmpc_kernel_init(kmp_int32 omp_handle, - /// kmp_int32 thread_limit); + /// \brief Call to void __kmpc_kernel_init(kmp_int32 thread_limit); OMPRTL_NVPTX__kmpc_kernel_init, -}; - -// NVPTX Address space -enum AddressSpace { - AddressSpaceShared = 3, + /// \brief Call to void __kmpc_kernel_deinit(); + OMPRTL_NVPTX__kmpc_kernel_deinit, }; } // namespace @@ -70,6 +66,15 @@ static void getNVPTXCTABarrier(CodeGenFunction &CGF) { /// Synchronize all GPU threads in a block. static void syncCTAThreads(CodeGenFunction &CGF) { getNVPTXCTABarrier(CGF); } +/// Get the value of the thread_limit clause in the teams directive. +/// The runtime encodes thread_limit in the launch parameter, always starting +/// thread_limit+warpSize threads per team. +static llvm::Value *getThreadLimit(CodeGenFunction &CGF) { + CGBuilderTy &Bld = CGF.Builder; + return Bld.CreateSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF), + "thread_limit"); +} + /// Get the thread id of the OMP master thread. /// The master thread id is the first thread (lane) of the last warp in the /// GPU block. Warp size is assumed to be some power of 2. 
@@ -103,35 +108,105 @@ void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction( CGM.getTypes().GetFunctionType(*CGFI), llvm::GlobalValue::InternalLinkage, /* placeholder */ "_worker", &CGM.getModule()); CGM.SetInternalFunctionAttributes(/*D=*/nullptr, WorkerFn, *CGFI); - WorkerFn->setLinkage(llvm::GlobalValue::InternalLinkage); - WorkerFn->addFnAttr(llvm::Attribute::NoInline); } -void CGOpenMPRuntimeNVPTX::initializeEnvironment() { - // - // Initialize master-worker control state in shared memory. - // +void CGOpenMPRuntimeNVPTX::emitGenericKernel(const OMPExecutableDirective &D, + StringRef ParentName, + llvm::Function *&OutlinedFn, + llvm::Constant *&OutlinedFnID, + bool IsOffloadEntry, + const RegionCodeGenTy &CodeGen) { + EntryFunctionState EST; + WorkerFunctionState WST(CGM); + + // Emit target region as a standalone region. + class NVPTXPrePostActionTy : public PrePostActionTy { + CGOpenMPRuntimeNVPTX &RT; + CGOpenMPRuntimeNVPTX::EntryFunctionState &EST; + CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST; + + public: + NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT, + CGOpenMPRuntimeNVPTX::EntryFunctionState &EST, + CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST) + : RT(RT), EST(EST), WST(WST) {} + void Enter(CodeGenFunction &CGF) override { + RT.emitGenericEntryHeader(CGF, EST, WST); + } + void Exit(CodeGenFunction &CGF) override { + RT.emitGenericEntryFooter(CGF, EST); + } + } Action(*this, EST, WST); + CodeGen.setAction(Action); + emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, + IsOffloadEntry, CodeGen); - auto DL = CGM.getDataLayout(); - ActiveWorkers = new llvm::GlobalVariable( - CGM.getModule(), CGM.Int32Ty, /*isConstant=*/false, - llvm::GlobalValue::CommonLinkage, - llvm::Constant::getNullValue(CGM.Int32Ty), "__omp_num_threads", 0, - llvm::GlobalVariable::NotThreadLocal, AddressSpaceShared); - ActiveWorkers->setAlignment(DL.getPrefTypeAlignment(CGM.Int32Ty)); - - WorkID = new llvm::GlobalVariable( - CGM.getModule(), 
CGM.Int64Ty, /*isConstant=*/false, - llvm::GlobalValue::CommonLinkage, - llvm::Constant::getNullValue(CGM.Int64Ty), "__tgt_work_id", 0, - llvm::GlobalVariable::NotThreadLocal, AddressSpaceShared); - WorkID->setAlignment(DL.getPrefTypeAlignment(CGM.Int64Ty)); + // Create the worker function + emitWorkerFunction(WST); + + // Now change the name of the worker function to correspond to this target + // region's entry function. + WST.WorkerFn->setName(OutlinedFn->getName() + "_worker"); +} + +// Setup NVPTX threads for master-worker OpenMP scheme. +void CGOpenMPRuntimeNVPTX::emitGenericEntryHeader(CodeGenFunction &CGF, + EntryFunctionState &EST, + WorkerFunctionState &WST) { + CGBuilderTy &Bld = CGF.Builder; + + llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker"); + llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck"); + llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master"); + EST.ExitBB = CGF.createBasicBlock(".exit"); + + auto *IsWorker = + Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF)); + Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB); + + CGF.EmitBlock(WorkerBB); + CGF.EmitCallOrInvoke(WST.WorkerFn, llvm::None); + CGF.EmitBranch(EST.ExitBB); + + CGF.EmitBlock(MasterCheckBB); + auto *IsMaster = + Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF)); + Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB); + + CGF.EmitBlock(MasterBB); + // First action in sequential region: + // Initialize the state of the OpenMP runtime library on the GPU. 
+ llvm::Value *Args[] = {getThreadLimit(CGF)}; + CGF.EmitRuntimeCall( + createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args); +} + +void CGOpenMPRuntimeNVPTX::emitGenericEntryFooter(CodeGenFunction &CGF, + EntryFunctionState &EST) { + if (!EST.ExitBB) + EST.ExitBB = CGF.createBasicBlock(".exit"); + + llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier"); + CGF.EmitBranch(TerminateBB); + + CGF.EmitBlock(TerminateBB); + // Signal termination condition. + CGF.EmitRuntimeCall( + createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_deinit), None); + // Barrier to terminate worker threads. + syncCTAThreads(CGF); + // Master thread jumps to exit point. + CGF.EmitBranch(EST.ExitBB); + + CGF.EmitBlock(EST.ExitBB); + EST.ExitBB = nullptr; } void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) { auto &Ctx = CGM.getContext(); CodeGenFunction CGF(CGM, /*suppressNewContext=*/true); + CGF.disableDebugInfo(); CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, *WST.CGFI, {}); emitWorkerLoop(CGF, WST); CGF.FinishFunction(); @@ -163,21 +238,26 @@ void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF, CGF.EmitBlock(AwaitBB); // Wait for parallel work syncCTAThreads(CGF); + + Address WorkFn = + CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn"); + Address ExecStatus = + CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status"); + CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0)); + CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy)); + + // TODO: Call into runtime to get parallel work. + // On termination condition (workid == 0), exit loop. 
- llvm::Value *ShouldTerminate = Bld.CreateICmpEQ( - Bld.CreateAlignedLoad(WorkID, WorkID->getAlignment()), - llvm::Constant::getNullValue(WorkID->getType()->getElementType()), - "should_terminate"); + llvm::Value *ShouldTerminate = + Bld.CreateIsNull(Bld.CreateLoad(WorkFn), "should_terminate"); Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB); // Activate requested workers. CGF.EmitBlock(SelectWorkersBB); - llvm::Value *ThreadID = getNVPTXThreadID(CGF); - llvm::Value *ActiveThread = Bld.CreateICmpSLT( - ThreadID, - Bld.CreateAlignedLoad(ActiveWorkers, ActiveWorkers->getAlignment()), - "active_thread"); - Bld.CreateCondBr(ActiveThread, ExecuteBB, BarrierBB); + llvm::Value *IsActive = + Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active"); + Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB); // Signal start of parallel region. CGF.EmitBlock(ExecuteBB); @@ -197,72 +277,6 @@ void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF, CGF.EmitBlock(ExitBB); } -// Setup NVPTX threads for master-worker OpenMP scheme. -void CGOpenMPRuntimeNVPTX::emitEntryHeader(CodeGenFunction &CGF, - EntryFunctionState &EST, - WorkerFunctionState &WST) { - CGBuilderTy &Bld = CGF.Builder; - - // Get the master thread id. - llvm::Value *MasterID = getMasterThreadID(CGF); - // Current thread's identifier. - llvm::Value *ThreadID = getNVPTXThreadID(CGF); - - // Setup BBs in entry function. - llvm::BasicBlock *WorkerCheckBB = CGF.createBasicBlock(".check.for.worker"); - llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker"); - llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master"); - EST.ExitBB = CGF.createBasicBlock(".exit"); - - // The head (master thread) marches on while its body of companion threads in - // the warp go to sleep. - llvm::Value *ShouldDie = - Bld.CreateICmpUGT(ThreadID, MasterID, "excess_in_master_warp"); - Bld.CreateCondBr(ShouldDie, EST.ExitBB, WorkerCheckBB); - - // Select worker threads... 
- CGF.EmitBlock(WorkerCheckBB); - llvm::Value *IsWorker = Bld.CreateICmpULT(ThreadID, MasterID, "is_worker"); - Bld.CreateCondBr(IsWorker, WorkerBB, MasterBB); - - // ... and send to worker loop, awaiting parallel invocation. - CGF.EmitBlock(WorkerBB); - CGF.EmitCallOrInvoke(WST.WorkerFn, llvm::None); - CGF.EmitBranch(EST.ExitBB); - - // Only master thread executes subsequent serial code. - CGF.EmitBlock(MasterBB); - - // First action in sequential region: - // Initialize the state of the OpenMP runtime library on the GPU. - llvm::Value *Args[] = {Bld.getInt32(/*OmpHandle=*/0), getNVPTXThreadID(CGF)}; - CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), - Args); -} - -void CGOpenMPRuntimeNVPTX::emitEntryFooter(CodeGenFunction &CGF, - EntryFunctionState &EST) { - if (!EST.ExitBB) - EST.ExitBB = CGF.createBasicBlock(".exit"); - - CGBuilderTy &Bld = CGF.Builder; - llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier"); - CGF.EmitBranch(TerminateBB); - - CGF.EmitBlock(TerminateBB); - // Signal termination condition. - Bld.CreateAlignedStore( - llvm::Constant::getNullValue(WorkID->getType()->getElementType()), WorkID, - WorkID->getAlignment()); - // Barrier to terminate worker threads. - syncCTAThreads(CGF); - // Master thread jumps to exit point. - CGF.EmitBranch(EST.ExitBB); - - CGF.EmitBlock(EST.ExitBB); - EST.ExitBB = nullptr; -} - /// \brief Returns specified OpenMP runtime function for the current OpenMP /// implementation. Specialized for the NVPTX device. /// \param Function OpenMP runtime function. 
@@ -272,21 +286,27 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) { llvm::Constant *RTLFn = nullptr; switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) { case OMPRTL_NVPTX__kmpc_kernel_init: { - // Build void __kmpc_kernel_init(kmp_int32 omp_handle, - // kmp_int32 thread_limit); - llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int32Ty}; + // Build void __kmpc_kernel_init(kmp_int32 thread_limit); + llvm::Type *TypeParams[] = {CGM.Int32Ty}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init"); break; } + case OMPRTL_NVPTX__kmpc_kernel_deinit: { + // Build void __kmpc_kernel_deinit(); + llvm::FunctionType *FnTy = + llvm::FunctionType::get(CGM.VoidTy, {}, /*isVarArg*/ false); + RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit"); + break; + } } return RTLFn; } void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *ID, llvm::Constant *Addr, - uint64_t Size) { + uint64_t Size, int32_t) { auto *F = dyn_cast<llvm::Function>(Addr); // TODO: Add support for global variables on the device after declare target // support. @@ -315,44 +335,14 @@ void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction( assert(!ParentName.empty() && "Invalid target region parent name!"); - EntryFunctionState EST; - WorkerFunctionState WST(CGM); - - // Emit target region as a standalone region. 
- class NVPTXPrePostActionTy : public PrePostActionTy { - CGOpenMPRuntimeNVPTX &RT; - CGOpenMPRuntimeNVPTX::EntryFunctionState &EST; - CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST; - - public: - NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT, - CGOpenMPRuntimeNVPTX::EntryFunctionState &EST, - CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST) - : RT(RT), EST(EST), WST(WST) {} - void Enter(CodeGenFunction &CGF) override { - RT.emitEntryHeader(CGF, EST, WST); - } - void Exit(CodeGenFunction &CGF) override { RT.emitEntryFooter(CGF, EST); } - } Action(*this, EST, WST); - CodeGen.setAction(Action); - emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, - IsOffloadEntry, CodeGen); - - // Create the worker function - emitWorkerFunction(WST); - - // Now change the name of the worker function to correspond to this target - // region's entry function. - WST.WorkerFn->setName(OutlinedFn->getName() + "_worker"); + emitGenericKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, + CodeGen); } CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM) - : CGOpenMPRuntime(CGM), ActiveWorkers(nullptr), WorkID(nullptr) { + : CGOpenMPRuntime(CGM) { if (!CGM.getLangOpts().OpenMPIsDevice) llvm_unreachable("OpenMP NVPTX can only handle device code."); - - // Called once per module during initialization. - initializeEnvironment(); } void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF, diff --git a/lib/CodeGen/CGOpenMPRuntimeNVPTX.h b/lib/CodeGen/CGOpenMPRuntimeNVPTX.h index a33fb27579f6..63a02965a5bd 100644 --- a/lib/CodeGen/CGOpenMPRuntimeNVPTX.h +++ b/lib/CodeGen/CGOpenMPRuntimeNVPTX.h @@ -24,7 +24,7 @@ namespace clang { namespace CodeGen { class CGOpenMPRuntimeNVPTX : public CGOpenMPRuntime { -public: +private: struct EntryFunctionState { llvm::BasicBlock *ExitBB = nullptr; }; @@ -40,34 +40,21 @@ public: void createWorkerFunction(CodeGenModule &CGM); }; - /// \brief Helper for target entry function. 
Guide the master and worker - /// threads to their respective locations. - void emitEntryHeader(CodeGenFunction &CGF, EntryFunctionState &EST, - WorkerFunctionState &WST); - - /// \brief Signal termination of OMP execution. - void emitEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST); - -private: - // - // Private state and methods. - // - - // Master-worker control state. - // Number of requested OMP threads in parallel region. - llvm::GlobalVariable *ActiveWorkers; - // Outlined function for the workers to execute. - llvm::GlobalVariable *WorkID; - - /// \brief Initialize master-worker control state. - void initializeEnvironment(); - /// \brief Emit the worker function for the current target region. void emitWorkerFunction(WorkerFunctionState &WST); /// \brief Helper for worker function. Emit body of worker loop. void emitWorkerLoop(CodeGenFunction &CGF, WorkerFunctionState &WST); + /// \brief Helper for generic target entry function. Guide the master and + /// worker threads to their respective locations. + void emitGenericEntryHeader(CodeGenFunction &CGF, EntryFunctionState &EST, + WorkerFunctionState &WST); + + /// \brief Signal termination of OMP execution for generic target entry + /// function. + void emitGenericEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST); + /// \brief Returns specified OpenMP runtime function for the current OpenMP /// implementation. Specialized for the NVPTX device. /// \param Function OpenMP runtime function. @@ -79,9 +66,23 @@ private: // /// \brief Creates offloading entry for the provided entry ID \a ID, - /// address \a Addr and size \a Size. + /// address \a Addr, size \a Size, and flags \a Flags. void createOffloadEntry(llvm::Constant *ID, llvm::Constant *Addr, - uint64_t Size) override; + uint64_t Size, int32_t Flags = 0) override; + + /// \brief Emit outlined function specialized for the Fork-Join + /// programming model for applicable target directives on the NVPTX device. 
+ /// \param D Directive to emit. + /// \param ParentName Name of the function that encloses the target region. + /// \param OutlinedFn Outlined function value to be defined by this call. + /// \param OutlinedFnID Outlined function ID value to be defined by this call. + /// \param IsOffloadEntry True if the outlined function is an offload entry. + /// An outlined function may not be an entry if, e.g. the if clause always + /// evaluates to false. + void emitGenericKernel(const OMPExecutableDirective &D, StringRef ParentName, + llvm::Function *&OutlinedFn, + llvm::Constant *&OutlinedFnID, bool IsOffloadEntry, + const RegionCodeGenTy &CodeGen); /// \brief Emit outlined function for 'target' directive on the NVPTX /// device. diff --git a/lib/CodeGen/CodeGenAction.cpp b/lib/CodeGen/CodeGenAction.cpp index 1e17918df4a4..5f74141d75b3 100644 --- a/lib/CodeGen/CodeGenAction.cpp +++ b/lib/CodeGen/CodeGenAction.cpp @@ -44,6 +44,7 @@ namespace clang { virtual void anchor(); DiagnosticsEngine &Diags; BackendAction Action; + const HeaderSearchOptions &HeaderSearchOpts; const CodeGenOptions &CodeGenOpts; const TargetOptions &TargetOpts; const LangOptions &LangOpts; @@ -77,8 +78,8 @@ namespace clang { const SmallVectorImpl<std::pair<unsigned, llvm::Module *>> &LinkModules, std::unique_ptr<raw_pwrite_stream> OS, LLVMContext &C, CoverageSourceInfo *CoverageInfo = nullptr) - : Diags(Diags), Action(Action), CodeGenOpts(CodeGenOpts), - TargetOpts(TargetOpts), LangOpts(LangOpts), + : Diags(Diags), Action(Action), HeaderSearchOpts(HeaderSearchOpts), + CodeGenOpts(CodeGenOpts), TargetOpts(TargetOpts), LangOpts(LangOpts), AsmOutStream(std::move(OS)), Context(nullptr), LLVMIRGeneration("irgen", "LLVM IR Generation Time"), LLVMIRGenerationRefCount(0), @@ -225,8 +226,8 @@ namespace clang { EmbedBitcode(getModule(), CodeGenOpts, llvm::MemoryBufferRef()); - EmitBackendOutput(Diags, CodeGenOpts, TargetOpts, LangOpts, - C.getTargetInfo().getDataLayout(), + EmitBackendOutput(Diags, 
HeaderSearchOpts, CodeGenOpts, TargetOpts, + LangOpts, C.getTargetInfo().getDataLayout(), getModule(), Action, std::move(AsmOutStream)); Ctx.setInlineAsmDiagnosticHandler(OldHandler, OldContext); @@ -898,9 +899,10 @@ void CodeGenAction::ExecuteAction() { Ctx.setInlineAsmDiagnosticHandler(BitcodeInlineAsmDiagHandler, &CI.getDiagnostics()); - EmitBackendOutput(CI.getDiagnostics(), CI.getCodeGenOpts(), TargetOpts, - CI.getLangOpts(), CI.getTarget().getDataLayout(), - TheModule.get(), BA, std::move(OS)); + EmitBackendOutput(CI.getDiagnostics(), CI.getHeaderSearchOpts(), + CI.getCodeGenOpts(), TargetOpts, CI.getLangOpts(), + CI.getTarget().getDataLayout(), TheModule.get(), BA, + std::move(OS)); return; } diff --git a/lib/CodeGen/CodeGenFunction.h b/lib/CodeGen/CodeGenFunction.h index 1347f54df9ac..05522cd40024 100644 --- a/lib/CodeGen/CodeGenFunction.h +++ b/lib/CodeGen/CodeGenFunction.h @@ -120,7 +120,7 @@ enum TypeEvaluationKind { SANITIZER_CHECK(OutOfBounds, out_of_bounds, 0) \ SANITIZER_CHECK(ShiftOutOfBounds, shift_out_of_bounds, 0) \ SANITIZER_CHECK(SubOverflow, sub_overflow, 0) \ - SANITIZER_CHECK(TypeMismatch, type_mismatch, 0) \ + SANITIZER_CHECK(TypeMismatch, type_mismatch, 1) \ SANITIZER_CHECK(VLABoundNotPositive, vla_bound_not_positive, 0) enum SanitizerHandler { diff --git a/lib/CodeGen/ObjectFilePCHContainerOperations.cpp b/lib/CodeGen/ObjectFilePCHContainerOperations.cpp index baf7811eedaf..754f9968b67f 100644 --- a/lib/CodeGen/ObjectFilePCHContainerOperations.cpp +++ b/lib/CodeGen/ObjectFilePCHContainerOperations.cpp @@ -282,7 +282,7 @@ public: // Print the IR for the PCH container to the debug output. 
llvm::SmallString<0> Buffer; clang::EmitBackendOutput( - Diags, CodeGenOpts, TargetOpts, LangOpts, + Diags, HeaderSearchOpts, CodeGenOpts, TargetOpts, LangOpts, Ctx.getTargetInfo().getDataLayout(), M.get(), BackendAction::Backend_EmitLL, llvm::make_unique<llvm::raw_svector_ostream>(Buffer)); @@ -290,9 +290,10 @@ public: }); // Use the LLVM backend to emit the pch container. - clang::EmitBackendOutput(Diags, CodeGenOpts, TargetOpts, LangOpts, - Ctx.getTargetInfo().getDataLayout(), M.get(), - BackendAction::Backend_EmitObj, std::move(OS)); + clang::EmitBackendOutput(Diags, HeaderSearchOpts, CodeGenOpts, TargetOpts, + LangOpts, Ctx.getTargetInfo().getDataLayout(), + M.get(), BackendAction::Backend_EmitObj, + std::move(OS)); // Free the memory for the temporary buffer. llvm::SmallVector<char, 0> Empty; diff --git a/lib/CodeGen/TargetInfo.cpp b/lib/CodeGen/TargetInfo.cpp index 391eb53d2500..d2fc3888ef29 100644 --- a/lib/CodeGen/TargetInfo.cpp +++ b/lib/CodeGen/TargetInfo.cpp @@ -871,6 +871,14 @@ static bool isX86VectorCallAggregateSmallEnough(uint64_t NumMembers) { return NumMembers <= 4; } +/// Returns a Homogeneous Vector Aggregate ABIArgInfo, used in X86. +static ABIArgInfo getDirectX86Hva(llvm::Type* T = nullptr) { + auto AI = ABIArgInfo::getDirect(T); + AI.setInReg(true); + AI.setCanBeFlattened(false); + return AI; +} + //===----------------------------------------------------------------------===// // X86-32 ABI Implementation //===----------------------------------------------------------------------===// @@ -884,6 +892,11 @@ struct CCState { unsigned FreeSSERegs; }; +enum { + // Vectorcall only allows the first 6 parameters to be passed in registers. + VectorcallMaxParamNumAsReg = 6 +}; + /// X86_32ABIInfo - The X86-32 ABI information. 
class X86_32ABIInfo : public SwiftABIInfo { enum Class { @@ -929,6 +942,8 @@ class X86_32ABIInfo : public SwiftABIInfo { Class classify(QualType Ty) const; ABIArgInfo classifyReturnType(QualType RetTy, CCState &State) const; ABIArgInfo classifyArgumentType(QualType RetTy, CCState &State) const; + ABIArgInfo reclassifyHvaArgType(QualType RetTy, CCState &State, + const ABIArgInfo& current) const; /// \brief Updates the number of available free registers, returns /// true if any registers were allocated. bool updateFreeRegs(QualType Ty, CCState &State) const; @@ -946,6 +961,8 @@ class X86_32ABIInfo : public SwiftABIInfo { void addFieldToArgStruct(SmallVector<llvm::Type *, 6> &FrameFields, CharUnits &StackOffset, ABIArgInfo &Info, QualType Type) const; + void computeVectorCallArgs(CGFunctionInfo &FI, CCState &State, + bool &UsedInAlloca) const; public: @@ -1494,6 +1511,27 @@ bool X86_32ABIInfo::shouldPrimitiveUseInReg(QualType Ty, CCState &State) const { return true; } +ABIArgInfo +X86_32ABIInfo::reclassifyHvaArgType(QualType Ty, CCState &State, + const ABIArgInfo &current) const { + // Assumes vectorCall calling convention. + const Type *Base = nullptr; + uint64_t NumElts = 0; + + if (!Ty->isBuiltinType() && !Ty->isVectorType() && + isHomogeneousAggregate(Ty, Base, NumElts)) { + if (State.FreeSSERegs >= NumElts) { + // HVA types get passed directly in registers if there is room. + State.FreeSSERegs -= NumElts; + return getDirectX86Hva(); + } + // If there's no room, the HVA gets passed as normal indirect + // structure. + return getIndirectResult(Ty, /*ByVal=*/false, State); + } + return current; +} + ABIArgInfo X86_32ABIInfo::classifyArgumentType(QualType Ty, CCState &State) const { // FIXME: Set alignment on indirect arguments. @@ -1513,19 +1551,34 @@ ABIArgInfo X86_32ABIInfo::classifyArgumentType(QualType Ty, } // vectorcall adds the concept of a homogenous vector aggregate, similar - // to other targets. + // to other targets, regcall uses some of the HVA rules. 
const Type *Base = nullptr; uint64_t NumElts = 0; if ((State.CC == llvm::CallingConv::X86_VectorCall || State.CC == llvm::CallingConv::X86_RegCall) && isHomogeneousAggregate(Ty, Base, NumElts)) { - if (State.FreeSSERegs >= NumElts) { - State.FreeSSERegs -= NumElts; - if (Ty->isBuiltinType() || Ty->isVectorType()) + + if (State.CC == llvm::CallingConv::X86_RegCall) { + if (State.FreeSSERegs >= NumElts) { + State.FreeSSERegs -= NumElts; + if (Ty->isBuiltinType() || Ty->isVectorType()) + return ABIArgInfo::getDirect(); + return ABIArgInfo::getExpand(); + + } + return getIndirectResult(Ty, /*ByVal=*/false, State); + } else if (State.CC == llvm::CallingConv::X86_VectorCall) { + if (State.FreeSSERegs >= NumElts && (Ty->isBuiltinType() || Ty->isVectorType())) { + // Actual floating-point types get registers first time through if + // there is registers available + State.FreeSSERegs -= NumElts; return ABIArgInfo::getDirect(); - return ABIArgInfo::getExpand(); + } else if (!Ty->isBuiltinType() && !Ty->isVectorType()) { + // HVA Types only get registers after everything else has been + // set, so it gets set as indirect for now. + return ABIArgInfo::getIndirect(getContext().getTypeAlignInChars(Ty)); + } } - return getIndirectResult(Ty, /*ByVal=*/false, State); } if (isAggregateTypeForABI(Ty)) { @@ -1604,6 +1657,36 @@ ABIArgInfo X86_32ABIInfo::classifyArgumentType(QualType Ty, return ABIArgInfo::getDirect(); } +void X86_32ABIInfo::computeVectorCallArgs(CGFunctionInfo &FI, CCState &State, + bool &UsedInAlloca) const { + // Vectorcall only allows the first 6 parameters to be passed in registers, + // and homogeneous vector aggregates are only put into registers as a second + // priority. + unsigned Count = 0; + CCState ZeroState = State; + ZeroState.FreeRegs = ZeroState.FreeSSERegs = 0; + // HVAs must be done as a second priority for registers, so the deferred + // items are dealt with by going through the pattern a second time. 
+ for (auto &I : FI.arguments()) { + if (Count < VectorcallMaxParamNumAsReg) + I.info = classifyArgumentType(I.type, State); + else + // Parameters after the 6th cannot be passed in registers, + // so pretend there are no registers left for them. + I.info = classifyArgumentType(I.type, ZeroState); + UsedInAlloca |= (I.info.getKind() == ABIArgInfo::InAlloca); + ++Count; + } + Count = 0; + // Go through the arguments a second time to get HVAs registers if there + // are still some available. + for (auto &I : FI.arguments()) { + if (Count < VectorcallMaxParamNumAsReg) + I.info = reclassifyHvaArgType(I.type, State, I.info); + ++Count; + } +} + void X86_32ABIInfo::computeInfo(CGFunctionInfo &FI) const { CCState State(FI.getCallingConvention()); if (IsMCUABI) @@ -1638,9 +1721,14 @@ void X86_32ABIInfo::computeInfo(CGFunctionInfo &FI) const { ++State.FreeRegs; bool UsedInAlloca = false; - for (auto &I : FI.arguments()) { - I.info = classifyArgumentType(I.type, State); - UsedInAlloca |= (I.info.getKind() == ABIArgInfo::InAlloca); + if (State.CC == llvm::CallingConv::X86_VectorCall) { + computeVectorCallArgs(FI, State, UsedInAlloca); + } else { + // If not vectorcall, revert to normal behavior. 
+ for (auto &I : FI.arguments()) { + I.info = classifyArgumentType(I.type, State); + UsedInAlloca |= (I.info.getKind() == ABIArgInfo::InAlloca); + } } // If we needed to use inalloca for any argument, do a second pass and rewrite @@ -2070,10 +2158,14 @@ public: } private: - ABIArgInfo classify(QualType Ty, unsigned &FreeSSERegs, - bool IsReturnType) const; - - bool IsMingw64; + ABIArgInfo classify(QualType Ty, unsigned &FreeSSERegs, bool IsReturnType, + bool IsVectorCall, bool IsRegCall) const; + ABIArgInfo reclassifyHvaArgType(QualType Ty, unsigned &FreeSSERegs, + const ABIArgInfo &current) const; + void computeVectorCallArgs(CGFunctionInfo &FI, unsigned FreeSSERegs, + bool IsVectorCall, bool IsRegCall) const; + + bool IsMingw64; }; class X86_64TargetCodeGenInfo : public TargetCodeGenInfo { @@ -3679,8 +3771,24 @@ Address X86_64ABIInfo::EmitMSVAArg(CodeGenFunction &CGF, Address VAListAddr, /*allowHigherAlign*/ false); } +ABIArgInfo +WinX86_64ABIInfo::reclassifyHvaArgType(QualType Ty, unsigned &FreeSSERegs, + const ABIArgInfo &current) const { + // Assumes vectorCall calling convention. + const Type *Base = nullptr; + uint64_t NumElts = 0; + + if (!Ty->isBuiltinType() && !Ty->isVectorType() && + isHomogeneousAggregate(Ty, Base, NumElts) && FreeSSERegs >= NumElts) { + FreeSSERegs -= NumElts; + return getDirectX86Hva(); + } + return current; +} + ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, unsigned &FreeSSERegs, - bool IsReturnType) const { + bool IsReturnType, bool IsVectorCall, + bool IsRegCall) const { if (Ty->isVoidType()) return ABIArgInfo::getIgnore(); @@ -3704,21 +3812,34 @@ ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, unsigned &FreeSSERegs, } - // vectorcall adds the concept of a homogenous vector aggregate, similar to - // other targets. 
const Type *Base = nullptr; uint64_t NumElts = 0; - if (FreeSSERegs && isHomogeneousAggregate(Ty, Base, NumElts)) { - if (FreeSSERegs >= NumElts) { - FreeSSERegs -= NumElts; - if (IsReturnType || Ty->isBuiltinType() || Ty->isVectorType()) + // vectorcall adds the concept of a homogenous vector aggregate, similar to + // other targets. + if ((IsVectorCall || IsRegCall) && + isHomogeneousAggregate(Ty, Base, NumElts)) { + if (IsRegCall) { + if (FreeSSERegs >= NumElts) { + FreeSSERegs -= NumElts; + if (IsReturnType || Ty->isBuiltinType() || Ty->isVectorType()) + return ABIArgInfo::getDirect(); + return ABIArgInfo::getExpand(); + } + return ABIArgInfo::getIndirect(Align, /*ByVal=*/false); + } else if (IsVectorCall) { + if (FreeSSERegs >= NumElts && + (IsReturnType || Ty->isBuiltinType() || Ty->isVectorType())) { + FreeSSERegs -= NumElts; return ABIArgInfo::getDirect(); - return ABIArgInfo::getExpand(); + } else if (IsReturnType) { + return ABIArgInfo::getExpand(); + } else if (!Ty->isBuiltinType() && !Ty->isVectorType()) { + // HVAs are delayed and reclassified in the 2nd step. + return ABIArgInfo::getIndirect(Align, /*ByVal=*/false); + } } - return ABIArgInfo::getIndirect(Align, /*ByVal=*/false); } - if (Ty->isMemberPointerType()) { // If the member pointer is represented by an LLVM int or ptr, pass it // directly. @@ -3754,6 +3875,32 @@ ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, unsigned &FreeSSERegs, return ABIArgInfo::getDirect(); } +void WinX86_64ABIInfo::computeVectorCallArgs(CGFunctionInfo &FI, + unsigned FreeSSERegs, + bool IsVectorCall, + bool IsRegCall) const { + unsigned Count = 0; + for (auto &I : FI.arguments()) { + if (Count < VectorcallMaxParamNumAsReg) + I.info = classify(I.type, FreeSSERegs, false, IsVectorCall, IsRegCall); + else { + // Since these cannot be passed in registers, pretend no registers + // are left. 
+ unsigned ZeroSSERegsAvail = 0; + I.info = classify(I.type, /*FreeSSERegs=*/ZeroSSERegsAvail, false, + IsVectorCall, IsRegCall); + } + ++Count; + } + + Count = 0; + for (auto &I : FI.arguments()) { + if (Count < VectorcallMaxParamNumAsReg) + I.info = reclassifyHvaArgType(I.type, FreeSSERegs, I.info); + ++Count; + } +} + void WinX86_64ABIInfo::computeInfo(CGFunctionInfo &FI) const { bool IsVectorCall = FI.getCallingConvention() == llvm::CallingConv::X86_VectorCall; @@ -3769,17 +3916,24 @@ void WinX86_64ABIInfo::computeInfo(CGFunctionInfo &FI) const { } if (!getCXXABI().classifyReturnType(FI)) - FI.getReturnInfo() = classify(FI.getReturnType(), FreeSSERegs, true); + FI.getReturnInfo() = classify(FI.getReturnType(), FreeSSERegs, true, + IsVectorCall, IsRegCall); if (IsVectorCall) { // We can use up to 6 SSE register parameters with vectorcall. FreeSSERegs = 6; } else if (IsRegCall) { + // RegCall gives us 16 SSE registers, we can reuse the return registers. FreeSSERegs = 16; } - for (auto &I : FI.arguments()) - I.info = classify(I.type, FreeSSERegs, false); + if (IsVectorCall) { + computeVectorCallArgs(FI, FreeSSERegs, IsVectorCall, IsRegCall); + } else { + for (auto &I : FI.arguments()) + I.info = classify(I.type, FreeSSERegs, false, IsVectorCall, IsRegCall); + } + } Address WinX86_64ABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, |