src - FreeBSD source tree

diff options


context:
space:
mode:

author	Dimitry Andric <dim@FreeBSD.org>	2022-03-20 11:40:34 +0000
committer	Dimitry Andric <dim@FreeBSD.org>	2022-05-14 11:43:05 +0000
commit	349cc55c9796c4596a5b9904cd3281af295f878f (patch)
tree	410c5a785075730a35f1272ca6a7adf72222ad03 /contrib/llvm-project/llvm/include/llvm/Frontend
parent	cb2ae6163174b90e999326ecec3699ee093a5d43 (diff)
parent	c0981da47d5696fe36474fcf86b4ce03ae3ff818 (diff)
download	src-349cc55c9796c4596a5b9904cd3281af295f878f.tar.gz src-349cc55c9796c4596a5b9904cd3281af295f878f.zip

Diffstat (limited to 'contrib/llvm-project/llvm/include/llvm/Frontend')

-rw-r--r--

contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMP.td

-rw-r--r--

contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h

-rw-r--r--

contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h

117

-rw-r--r--

contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h

430

-rw-r--r--

contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def

5 files changed, 544 insertions, 143 deletions

diff --git a/contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMP.td b/contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMP.td
index 3dc6194c7830..5ee379b7fcad 100644
--- a/contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMP.td
+++ b/contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMP.td

@@ -144,6 +144,26 @@ def OMPC_Schedule : Clause<"schedule"> {

];

}

+def OMP_MEMORY_ORDER_SeqCst : ClauseVal<"seq_cst", 1, 1> {}

+def OMP_MEMORY_ORDER_AcqRel : ClauseVal<"acq_rel", 2, 1> {}

+def OMP_MEMORY_ORDER_Acquire : ClauseVal<"acquire", 3, 1> {}

+def OMP_MEMORY_ORDER_Release : ClauseVal<"release", 4, 1> {}

+def OMP_MEMORY_ORDER_Relaxed : ClauseVal<"relaxed", 5, 1> {}

+def OMP_MEMORY_ORDER_Default : ClauseVal<"default", 6, 0> {

+ let isDefault = 1;

+def OMPC_MemoryOrder : Clause<"memory_order"> {

+ let enumClauseValue = "MemoryOrderKind";

+ let allowedClauseValues = [

+ OMP_MEMORY_ORDER_SeqCst,

+ OMP_MEMORY_ORDER_AcqRel,

+ OMP_MEMORY_ORDER_Acquire,

+ OMP_MEMORY_ORDER_Release,

+ OMP_MEMORY_ORDER_Relaxed,

+ OMP_MEMORY_ORDER_Default

+ ];

def OMPC_Ordered : Clause<"ordered"> {

let clangClass = "OMPOrderedClause";

let flangClass = "ScalarIntConstantExpr";

@@ -261,13 +281,17 @@ def OMPC_Allocate : Clause<"allocate"> {

}

def OMPC_NonTemporal : Clause<"nontemporal"> {

let clangClass = "OMPNontemporalClause";

+ let flangClass = "Name";

+ let isValueList = true;

}

-def OMP_ORDER_concurrent : ClauseVal<"default",2,0> { let isDefault = 1; }

+def OMP_ORDER_concurrent : ClauseVal<"concurrent",1,1> {}

+def OMP_ORDER_unknown : ClauseVal<"unknown",2,0> { let isDefault = 1; }

def OMPC_Order : Clause<"order"> {

let clangClass = "OMPOrderClause";

let enumClauseValue = "OrderKind";

let allowedClauseValues = [

+ OMP_ORDER_unknown,

OMP_ORDER_concurrent

];

}

@@ -312,6 +336,8 @@ def OMPC_Uniform : Clause<"uniform"> {

}

def OMPC_DeviceType : Clause<"device_type"> {}

def OMPC_Match : Clause<"match"> {}

+def OMPC_AdjustArgs : Clause<"adjust_args"> { }

+def OMPC_AppendArgs : Clause<"append_args"> { }

def OMPC_Depobj : Clause<"depobj"> {

let clangClass = "OMPDepobjClause";

let isImplicit = true;

@@ -337,6 +363,14 @@ def OMPC_Filter : Clause<"filter"> {

let clangClass = "OMPFilterClause";

let flangClass = "ScalarIntExpr";

}

+def OMPC_Align : Clause<"align"> {

+ let clangClass = "OMPAlignClause";

+def OMPC_When: Clause<"when"> {}

+def OMPC_Bind : Clause<"bind"> {

+ let clangClass = "OMPBindClause";

//===----------------------------------------------------------------------===//

// Definition of OpenMP directives

@@ -473,8 +507,8 @@ def OMP_TaskWait : Directive<"taskwait"> {

}

def OMP_TaskGroup : Directive<"taskgroup"> {

let allowedClauses = [

- VersionedClause<OMPC_TaskReduction>,

- VersionedClause<OMPC_Allocate>

+ VersionedClause<OMPC_TaskReduction, 50>,

+ VersionedClause<OMPC_Allocate, 50>

];

}

def OMP_Flush : Directive<"flush"> {

@@ -489,10 +523,12 @@ def OMP_Flush : Directive<"flush"> {

}

def OMP_Ordered : Directive<"ordered"> {

let allowedClauses = [

- VersionedClause<OMPC_Threads>,

- VersionedClause<OMPC_Simd>,

VersionedClause<OMPC_Depend>

];

+ let allowedOnceClauses = [

+ VersionedClause<OMPC_Threads>,

+ VersionedClause<OMPC_Simd>

+ ];

}

def OMP_Atomic : Directive<"atomic"> {

let allowedClauses = [

@@ -1506,13 +1542,18 @@ def OMP_TargetTeamsDistributeSimd :

}

def OMP_Allocate : Directive<"allocate"> {

let allowedOnceClauses = [

- VersionedClause<OMPC_Allocator>

+ VersionedClause<OMPC_Allocator>,

+ VersionedClause<OMPC_Align, 51>

];

}

def OMP_DeclareVariant : Directive<"declare variant"> {

let allowedClauses = [

VersionedClause<OMPC_Match>

];

+ let allowedExclusiveClauses = [

+ VersionedClause<OMPC_AdjustArgs, 51>,

+ VersionedClause<OMPC_AppendArgs, 51>

+ ];

}

def OMP_MasterTaskloop : Directive<"master taskloop"> {

let allowedClauses = [

@@ -1699,6 +1740,22 @@ def OMP_masked : Directive<"masked"> {

VersionedClause<OMPC_Filter>

];

}

+def OMP_loop : Directive<"loop"> {

+ let allowedClauses = [

+ VersionedClause<OMPC_LastPrivate>,

+ VersionedClause<OMPC_Private>,

+ VersionedClause<OMPC_Reduction>,

+ ];

+ let allowedOnceClauses = [

+ VersionedClause<OMPC_Bind, 50>,

+ VersionedClause<OMPC_Collapse>,

+ VersionedClause<OMPC_Order>,

+ ];

+def OMP_Metadirective : Directive<"metadirective"> {

+ let allowedClauses = [VersionedClause<OMPC_When>];

+ let allowedOnceClauses = [VersionedClause<OMPC_Default>];

def OMP_Unknown : Directive<"unknown"> {

let isDefault = true;

}

diff --git a/contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h b/contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
index d174cc8992dd..2fec3e7e4230 100644
--- a/contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
+++ b/contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h

@@ -128,6 +128,14 @@ enum class OMPScheduleType {

LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue */ ModifierMask)

};

+enum OMPTgtExecModeFlags : int8_t {

+ OMP_TGT_EXEC_MODE_GENERIC = 1 << 0,

+ OMP_TGT_EXEC_MODE_SPMD = 1 << 1,

+ OMP_TGT_EXEC_MODE_GENERIC_SPMD =

+ OMP_TGT_EXEC_MODE_GENERIC | OMP_TGT_EXEC_MODE_SPMD,

+ LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue */ OMP_TGT_EXEC_MODE_GENERIC_SPMD)

+};

} // end namespace omp

} // end namespace llvm

diff --git a/contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h b/contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
index 0b6aed1e9e12..89f5de229b3b 100644
--- a/contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
+++ b/contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h

@@ -29,100 +29,89 @@ namespace omp {

///

/// Example usage in clang:

/// const unsigned slot_size =

-/// ctx.GetTargetInfo().getGridValue(llvm::omp::GVIDX::GV_Warp_Size);

+/// ctx.GetTargetInfo().getGridValue().GV_Warp_Size;

///

/// Example usage in libomptarget/deviceRTLs:

/// #include "llvm/Frontend/OpenMP/OMPGridValues.h"

/// #ifdef __AMDGPU__

-/// #define GRIDVAL AMDGPUGpuGridValues

+/// #define GRIDVAL AMDGPUGridValues

/// #else

-/// #define GRIDVAL NVPTXGpuGridValues

+/// #define GRIDVAL NVPTXGridValues

/// #endif

/// ... Then use this reference for GV_Warp_Size in the deviceRTL source.

-/// llvm::omp::GRIDVAL[llvm::omp::GVIDX::GV_Warp_Size]

+/// llvm::omp::GRIDVAL().GV_Warp_Size

///

/// Example usage in libomptarget hsa plugin:

/// #include "llvm/Frontend/OpenMP/OMPGridValues.h"

-/// #define GRIDVAL AMDGPUGpuGridValues

+/// #define GRIDVAL AMDGPUGridValues

/// ... Then use this reference to access GV_Warp_Size in the hsa plugin.

-/// llvm::omp::GRIDVAL[llvm::omp::GVIDX::GV_Warp_Size]

+/// llvm::omp::GRIDVAL().GV_Warp_Size

///

/// Example usage in libomptarget cuda plugin:

/// #include "llvm/Frontend/OpenMP/OMPGridValues.h"

-/// #define GRIDVAL NVPTXGpuGridValues

+/// #define GRIDVAL NVPTXGridValues

/// ... Then use this reference to access GV_Warp_Size in the cuda plugin.

-/// llvm::omp::GRIDVAL[llvm::omp::GVIDX::GV_Warp_Size]

+/// llvm::omp::GRIDVAL().GV_Warp_Size

///

-enum GVIDX {

- /// The maximum number of workers in a kernel.

- /// (THREAD_ABSOLUTE_LIMIT) - (GV_Warp_Size), might be issue for blockDim.z

- GV_Threads,

+struct GV {

/// The size reserved for data in a shared memory slot.

- GV_Slot_Size,

+ const unsigned GV_Slot_Size;

/// The default value of maximum number of threads in a worker warp.

- GV_Warp_Size,

- /// Alternate warp size for some AMDGCN architectures. Same as GV_Warp_Size

- /// for NVPTX.

- GV_Warp_Size_32,

- /// The number of bits required to represent the max number of threads in warp

- GV_Warp_Size_Log2,

- /// GV_Warp_Size * GV_Slot_Size,

- GV_Warp_Slot_Size,

+ const unsigned GV_Warp_Size;

+ constexpr unsigned warpSlotSize() const {

+ return GV_Warp_Size * GV_Slot_Size;

+ }

/// the maximum number of teams.

- GV_Max_Teams,

- /// Global Memory Alignment

- GV_Mem_Align,

- /// (~0u >> (GV_Warp_Size - GV_Warp_Size_Log2))

- GV_Warp_Size_Log2_Mask,

+ const unsigned GV_Max_Teams;

// An alternative to the heavy data sharing infrastructure that uses global

// memory is one that uses device __shared__ memory. The amount of such space

// (in bytes) reserved by the OpenMP runtime is noted here.

- GV_SimpleBufferSize,

+ const unsigned GV_SimpleBufferSize;

// The absolute maximum team size for a working group

- GV_Max_WG_Size,

+ const unsigned GV_Max_WG_Size;

// The default maximum team size for a working group

- GV_Default_WG_Size,

- // This is GV_Max_WG_Size / GV_WarpSize. 32 for NVPTX and 16 for AMDGCN.

- GV_Max_Warp_Number,

- /// The slot size that should be reserved for a working warp.

- /// (~0u >> (GV_Warp_Size - GV_Warp_Size_Log2))

- GV_Warp_Size_Log2_MaskL

+ const unsigned GV_Default_WG_Size;

+ constexpr unsigned maxWarpNumber() const {

+ return GV_Max_WG_Size / GV_Warp_Size;

+ }

};

/// For AMDGPU GPUs

-static constexpr unsigned AMDGPUGpuGridValues[] = {

- 448, // GV_Threads

- 256, // GV_Slot_Size

- 64, // GV_Warp_Size

- 32, // GV_Warp_Size_32

- 6, // GV_Warp_Size_Log2

- 64 * 256, // GV_Warp_Slot_Size

- 128, // GV_Max_Teams

- 256, // GV_Mem_Align

- 63, // GV_Warp_Size_Log2_Mask

- 896, // GV_SimpleBufferSize

- 1024, // GV_Max_WG_Size,

- 256, // GV_Defaut_WG_Size

- 1024 / 64, // GV_Max_WG_Size / GV_WarpSize

- 63 // GV_Warp_Size_Log2_MaskL

+static constexpr GV AMDGPUGridValues64 = {

+ 256, // GV_Slot_Size

+ 64, // GV_Warp_Size

+ 128, // GV_Max_Teams

+ 896, // GV_SimpleBufferSize

+ 1024, // GV_Max_WG_Size,

+ 256, // GV_Default_WG_Size

};

+static constexpr GV AMDGPUGridValues32 = {

+ 256, // GV_Slot_Size

+ 32, // GV_Warp_Size

+ 128, // GV_Max_Teams

+ 896, // GV_SimpleBufferSize

+ 1024, // GV_Max_WG_Size,

+ 256, // GV_Default_WG_Size

+};

+template <unsigned wavesize> constexpr const GV &getAMDGPUGridValues() {

+ static_assert(wavesize == 32 || wavesize == 64, "");

+ return wavesize == 32 ? AMDGPUGridValues32 : AMDGPUGridValues64;

/// For Nvidia GPUs

-static constexpr unsigned NVPTXGpuGridValues[] = {

- 992, // GV_Threads

- 256, // GV_Slot_Size

- 32, // GV_Warp_Size

- 32, // GV_Warp_Size_32

- 5, // GV_Warp_Size_Log2

- 32 * 256, // GV_Warp_Slot_Size

- 1024, // GV_Max_Teams

- 256, // GV_Mem_Align

- (~0u >> (32 - 5)), // GV_Warp_Size_Log2_Mask

- 896, // GV_SimpleBufferSize

- 1024, // GV_Max_WG_Size

- 128, // GV_Defaut_WG_Size

- 1024 / 32, // GV_Max_WG_Size / GV_WarpSize

- 31 // GV_Warp_Size_Log2_MaskL

+static constexpr GV NVPTXGridValues = {

+ 256, // GV_Slot_Size

+ 32, // GV_Warp_Size

+ 1024, // GV_Max_Teams

+ 896, // GV_SimpleBufferSize

+ 1024, // GV_Max_WG_Size

+ 128, // GV_Default_WG_Size

};

} // namespace omp

diff --git a/contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 8144f1527a06..563e0eed1762 100644
--- a/contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h

@@ -257,18 +257,17 @@ public:

///

/// * Sign of the step and the comparison operator might disagree:

///

- /// for (int i = 0; i < 42; --i)

+ /// for (int i = 0; i < 42; i -= 1u)

///

/// \param Loc The insert and source location description.

/// \param BodyGenCB Callback that will generate the loop body code.

/// \param Start Value of the loop counter for the first iterations.

- /// \param Stop Loop counter values past this will stop the the

- /// iterations.

+ /// \param Stop Loop counter values past this will stop the loop.

/// \param Step Loop counter increment after each iteration; negative

- /// means counting down. \param IsSigned Whether Start, Stop

- /// and Stop are signed integers.

- /// \param InclusiveStop Whether \p Stop itself is a valid value for the loop

+ /// means counting down.

+ /// \param IsSigned Whether Start, Stop and Step are signed integers.

+ /// \param InclusiveStop Whether \p Stop itself is a valid value for the loop

/// counter.

/// \param ComputeIP Insertion point for instructions computing the trip

/// count. Can be used to ensure the trip count is available

@@ -335,7 +334,7 @@ public:

/// has a trip count of 0). This is permitted by the OpenMP specification.

///

/// \param DL Debug location for instructions added for collapsing,

- /// such as instructions to compute derive the input loop's

+ /// such as instructions to compute/derive the input loop's

/// induction variables.

/// \param Loops Loops in the loop nest to collapse. Loops are specified

/// from outermost-to-innermost and every control flow of a

@@ -358,8 +357,16 @@ public:

/// the current thread, updates the relevant instructions in the canonical

/// loop and calls to an OpenMP runtime finalization function after the loop.

///

- /// \param Loc The source location description, the insertion location

- /// is not used.

+ /// TODO: Workshare loops with static scheduling may contain up to two loops

+ /// that fulfill the requirements of an OpenMP canonical loop. One for

+ /// iterating over all iterations of a chunk and another one for iterating

+ /// over all chunks that are executed on the same thread. Returning

+ /// CanonicalLoopInfo objects representing them may eventually be useful for

+ /// the apply clause planned in OpenMP 6.0, but currently whether these are

+ /// canonical loops is irrelevant.

+ ///

+ /// \param DL Debug location for instructions added for the

+ /// workshare-loop construct itself.

/// \param CLI A descriptor of the canonical loop to workshare.

/// \param AllocaIP An insertion point for Alloca instructions usable in the

/// preheader of the loop.

@@ -368,12 +375,11 @@ public:

/// \param Chunk The size of loop chunk considered as a unit when

/// scheduling. If \p nullptr, defaults to 1.

///

- /// \returns Updated CanonicalLoopInfo.

- CanonicalLoopInfo *createStaticWorkshareLoop(const LocationDescription &Loc,

- CanonicalLoopInfo *CLI,

- InsertPointTy AllocaIP,

- bool NeedsBarrier,

- Value *Chunk = nullptr);

+ /// \returns Point where to insert code after the workshare construct.

+ InsertPointTy applyStaticWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,

+ InsertPointTy AllocaIP,

+ bool NeedsBarrier,

+ Value *Chunk = nullptr);

/// Modifies the canonical loop to be a dynamically-scheduled workshare loop.

///

@@ -382,8 +388,9 @@ public:

/// turn it into a workshare loop. In particular, it calls to an OpenMP

/// runtime function in the preheader to obtain, and then in each iteration

/// to update the loop counter.

- /// \param Loc The source location description, the insertion location

- /// is not used.

+ ///

+ /// \param DL Debug location for instructions added for the

+ /// workshare-loop construct itself.

/// \param CLI A descriptor of the canonical loop to workshare.

/// \param AllocaIP An insertion point for Alloca instructions usable in the

/// preheader of the loop.

@@ -393,13 +400,12 @@ public:

/// \param Chunk The size of loop chunk considered as a unit when

/// scheduling. If \p nullptr, defaults to 1.

///

- /// \returns Point where to insert code after the loop.

- InsertPointTy createDynamicWorkshareLoop(const LocationDescription &Loc,

- CanonicalLoopInfo *CLI,

- InsertPointTy AllocaIP,

- omp::OMPScheduleType SchedType,

- bool NeedsBarrier,

- Value *Chunk = nullptr);

+ /// \returns Point where to insert code after the workshare construct.

+ InsertPointTy applyDynamicWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,

+ InsertPointTy AllocaIP,

+ omp::OMPScheduleType SchedType,

+ bool NeedsBarrier,

+ Value *Chunk = nullptr);

/// Modifies the canonical loop to be a workshare loop.

///

@@ -410,19 +416,17 @@ public:

/// the current thread, updates the relevant instructions in the canonical

/// loop and calls to an OpenMP runtime finalization function after the loop.

///

- /// \param Loc The source location description, the insertion location

- /// is not used.

+ /// \param DL Debug location for instructions added for the

+ /// workshare-loop construct itself.

/// \param CLI A descriptor of the canonical loop to workshare.

/// \param AllocaIP An insertion point for Alloca instructions usable in the

/// preheader of the loop.

/// \param NeedsBarrier Indicates whether a barrier must be insterted after

/// the loop.

///

- /// \returns Updated CanonicalLoopInfo.

- CanonicalLoopInfo *createWorkshareLoop(const LocationDescription &Loc,

- CanonicalLoopInfo *CLI,

- InsertPointTy AllocaIP,

- bool NeedsBarrier);

+ /// \returns Point where to insert code after the workshare construct.

+ InsertPointTy applyWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,

+ InsertPointTy AllocaIP, bool NeedsBarrier);

/// Tile a loop nest.

///

@@ -471,6 +475,48 @@ public:

tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,

ArrayRef<Value *> TileSizes);

+ /// Fully unroll a loop.

+ ///

+ /// Instead of unrolling the loop immediately (and duplicating its body

+ /// instructions), it is deferred to LLVM's LoopUnrollPass by adding loop

+ /// metadata.

+ ///

+ /// \param DL Debug location for instructions added by unrolling.

+ /// \param Loop The loop to unroll. The loop will be invalidated.

+ void unrollLoopFull(DebugLoc DL, CanonicalLoopInfo *Loop);

+ /// Fully or partially unroll a loop. How the loop is unrolled is determined

+ /// using LLVM's LoopUnrollPass.

+ ///

+ /// \param DL Debug location for instructions added by unrolling.

+ /// \param Loop The loop to unroll. The loop will be invalidated.

+ void unrollLoopHeuristic(DebugLoc DL, CanonicalLoopInfo *Loop);

+ /// Partially unroll a loop.

+ ///

+ /// The CanonicalLoopInfo of the unrolled loop for use with chained

+ /// loop-associated directive can be requested using \p UnrolledCLI. Not

+ /// needing the CanonicalLoopInfo allows more efficient code generation by

+ /// deferring the actual unrolling to the LoopUnrollPass using loop metadata.

+ /// A loop-associated directive applied to the unrolled loop needs to know the

+ /// new trip count which means that if using a heuristically determined unroll

+ /// factor (\p Factor == 0), that factor must be computed immediately. We are

+ /// using the same logic as the LoopUnrollPass to derived the unroll factor,

+ /// but which assumes that some canonicalization has taken place (e.g.

+ /// Mem2Reg, LICM, GVN, Inlining, etc.). That is, the heuristic will perform

+ /// better when the unrolled loop's CanonicalLoopInfo is not needed.

+ ///

+ /// \param DL Debug location for instructions added by unrolling.

+ /// \param Loop The loop to unroll. The loop will be invalidated.

+ /// \param Factor The factor to unroll the loop by. A factor of 0

+ /// indicates that a heuristic should be used to determine

+ /// the unroll-factor.

+ /// \param UnrolledCLI If non-null, receives the CanonicalLoopInfo of the

+ /// partially unrolled loop. Otherwise, uses loop metadata

+ /// to defer unrolling to the LoopUnrollPass.

+ void unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop, int32_t Factor,

+ CanonicalLoopInfo **UnrolledCLI);

/// Generator for '#omp flush'

///

/// \param Loc The location where the flush directive was encountered

@@ -486,6 +532,115 @@ public:

/// \param Loc The location where the taskyield directive was encountered.

void createTaskyield(const LocationDescription &Loc);

+ /// Functions used to generate reductions. Such functions take two Values

+ /// representing LHS and RHS of the reduction, respectively, and a reference

+ /// to the value that is updated to refer to the reduction result.

+ using ReductionGenTy =

+ function_ref<InsertPointTy(InsertPointTy, Value *, Value *, Value *&)>;

+ /// Functions used to generate atomic reductions. Such functions take two

+ /// Values representing pointers to LHS and RHS of the reduction. They are

+ /// expected to atomically update the LHS to the reduced value.

+ using AtomicReductionGenTy =

+ function_ref<InsertPointTy(InsertPointTy, Value *, Value *)>;

+ /// Information about an OpenMP reduction.

+ struct ReductionInfo {

+ ReductionInfo(Value *Variable, Value *PrivateVariable,

+ ReductionGenTy ReductionGen,

+ AtomicReductionGenTy AtomicReductionGen)

+ : Variable(Variable), PrivateVariable(PrivateVariable),

+ ReductionGen(ReductionGen), AtomicReductionGen(AtomicReductionGen) {}

+ /// Returns the type of the element being reduced.

+ Type *getElementType() const {

+ return Variable->getType()->getPointerElementType();

+ }

+ /// Reduction variable of pointer type.

+ Value *Variable;

+ /// Thread-private partial reduction variable.

+ Value *PrivateVariable;

+ /// Callback for generating the reduction body. The IR produced by this will

+ /// be used to combine two values in a thread-safe context, e.g., under

+ /// lock or within the same thread, and therefore need not be atomic.

+ ReductionGenTy ReductionGen;

+ /// Callback for generating the atomic reduction body, may be null. The IR

+ /// produced by this will be used to atomically combine two values during

+ /// reduction. If null, the implementation will use the non-atomic version

+ /// along with the appropriate synchronization mechanisms.

+ AtomicReductionGenTy AtomicReductionGen;

+ };

+ // TODO: provide atomic and non-atomic reduction generators for reduction

+ // operators defined by the OpenMP specification.

+ /// Generator for '#omp reduction'.

+ ///

+ /// Emits the IR instructing the runtime to perform the specific kind of

+ /// reductions. Expects reduction variables to have been privatized and

+ /// initialized to reduction-neutral values separately. Emits the calls to

+ /// runtime functions as well as the reduction function and the basic blocks

+ /// performing the reduction atomically and non-atomically.

+ ///

+ /// The code emitted for the following:

+ ///

+ /// \code

+ /// type var_1;

+ /// type var_2;

+ /// #pragma omp <directive> reduction(reduction-op:var_1,var_2)

+ /// /* body */;

+ /// \endcode

+ ///

+ /// corresponds to the following sketch.

+ ///

+ /// \code

+ /// void _outlined_par() {

+ /// // N is the number of different reductions.

+ /// void *red_array[] = {privatized_var_1, privatized_var_2, ...};

+ /// switch(__kmpc_reduce(..., N, /*size of data in red array*/, red_array,

+ /// _omp_reduction_func,

+ /// _gomp_critical_user.reduction.var)) {

+ /// case 1: {

+ /// var_1 = var_1 <reduction-op> privatized_var_1;

+ /// var_2 = var_2 <reduction-op> privatized_var_2;

+ /// // ...

+ /// __kmpc_end_reduce(...);

+ /// break;

+ /// }

+ /// case 2: {

+ /// _Atomic<ReductionOp>(var_1, privatized_var_1);

+ /// _Atomic<ReductionOp>(var_2, privatized_var_2);

+ /// // ...

+ /// break;

+ /// }

+ /// default: break;

+ /// }

+ ///

+ /// void _omp_reduction_func(void **lhs, void **rhs) {

+ /// *(type *)lhs[0] = *(type *)lhs[0] <reduction-op> *(type *)rhs[0];

+ /// *(type *)lhs[1] = *(type *)lhs[1] <reduction-op> *(type *)rhs[1];

+ /// // ...

+ /// }

+ /// \endcode

+ ///

+ /// \param Loc The location where the reduction was

+ /// encountered. Must be within the associate

+ /// directive and after the last local access to the

+ /// reduction variables.

+ /// \param AllocaIP An insertion point suitable for allocas usable

+ /// in reductions.

+ /// \param ReductionInfos A list of info on each reduction variable.

+ /// \param IsNoWait A flag set if the reduction is marked as nowait.

+ InsertPointTy createReductions(const LocationDescription &Loc,

+ InsertPointTy AllocaIP,

+ ArrayRef<ReductionInfo> ReductionInfos,

+ bool IsNoWait = false);

///}

/// Return the insertion point used by the underlying IRBuilder.

@@ -515,6 +670,10 @@ public:

Constant *getOrCreateSrcLocStr(StringRef FunctionName, StringRef FileName,

unsigned Line, unsigned Column);

+ /// Return the (LLVM-IR) string describing the DebugLoc \p DL. Use \p F as

+ /// fallback if \p DL does not specify the function name.

+ Constant *getOrCreateSrcLocStr(DebugLoc DL, Function *F = nullptr);

/// Return the (LLVM-IR) string describing the source location \p Loc.

Constant *getOrCreateSrcLocStr(const LocationDescription &Loc);

@@ -524,8 +683,8 @@ public:

omp::IdentFlag Flags = omp::IdentFlag(0),

unsigned Reserve2Flags = 0);

- // Get the type corresponding to __kmpc_impl_lanemask_t from the deviceRTL

- Type *getLanemaskType();

+ /// Create a global flag \p Namein the module with initial value \p Value.

+ GlobalValue *createGlobalFlag(unsigned Value, StringRef Name);

/// Generate control flow and cleanup for cancellation.

///

@@ -651,11 +810,11 @@ public:

/// \param Loc The source location description.

/// \param MapperFunc Function to be called.

/// \param SrcLocInfo Source location information global.

- /// \param MaptypesArgs

- /// \param MapnamesArg

+ /// \param MaptypesArg The argument types.

+ /// \param MapnamesArg The argument names.

/// \param MapperAllocas The AllocaInst used for the call.

/// \param DeviceID Device ID for the call.

- /// \param TotalNbOperand Number of operand in the call.

+ /// \param NumOperands Number of operands in the call.

void emitMapperCall(const LocationDescription &Loc, Function *MapperFunc,

Value *SrcLocInfo, Value *MaptypesArg, Value *MapnamesArg,

struct MapperAllocas &MapperAllocas, int64_t DeviceID,

@@ -705,7 +864,7 @@ public:

/// \param BodyGenCB Callback that will generate the region code.

/// \param FiniCB Callback to finialize variable copies.

///

- /// \returns The insertion position *after* the master.

+ /// \returns The insertion position *after* the masked.

InsertPointTy createMasked(const LocationDescription &Loc,

BodyGenCallbackTy BodyGenCB,

FinalizeCallbackTy FiniCB, Value *Filter);

@@ -718,12 +877,41 @@ public:

/// \param CriticalName name of the lock used by the critical directive

/// \param HintInst Hint Instruction for hint clause associated with critical

///

- /// \returns The insertion position *after* the master.

+ /// \returns The insertion position *after* the critical.

InsertPointTy createCritical(const LocationDescription &Loc,

BodyGenCallbackTy BodyGenCB,

FinalizeCallbackTy FiniCB,

StringRef CriticalName, Value *HintInst);

+ /// Generator for '#omp ordered depend (source | sink)'

+ ///

+ /// \param Loc The insert and source location description.

+ /// \param AllocaIP The insertion point to be used for alloca instructions.

+ /// \param NumLoops The number of loops in depend clause.

+ /// \param StoreValues The value will be stored in vector address.

+ /// \param Name The name of alloca instruction.

+ /// \param IsDependSource If true, depend source; otherwise, depend sink.

+ ///

+ /// \return The insertion position *after* the ordered.

+ InsertPointTy createOrderedDepend(const LocationDescription &Loc,

+ InsertPointTy AllocaIP, unsigned NumLoops,

+ ArrayRef<llvm::Value *> StoreValues,

+ const Twine &Name, bool IsDependSource);

+ /// Generator for '#omp ordered [threads | simd]'

+ ///

+ /// \param Loc The insert and source location description.

+ /// \param BodyGenCB Callback that will generate the region code.

+ /// \param FiniCB Callback to finalize variable copies.

+ /// \param IsThreads If true, with threads clause or without clause;

+ /// otherwise, with simd clause;

+ ///

+ /// \returns The insertion position *after* the ordered.

+ InsertPointTy createOrderedThreadsSimd(const LocationDescription &Loc,

+ BodyGenCallbackTy BodyGenCB,

+ FinalizeCallbackTy FiniCB,

+ bool IsThreads);

/// Generator for '#omp sections'

///

/// \param Loc The insert and source location description.

@@ -816,14 +1004,16 @@ public:

/// \param Loc The insert and source location description.

/// \param IsSPMD Flag to indicate if the kernel is an SPMD kernel or not.

/// \param RequiresFullRuntime Indicate if a full device runtime is necessary.

- InsertPointTy createTargetInit(const LocationDescription &Loc, bool IsSPMD, bool RequiresFullRuntime);

+ InsertPointTy createTargetInit(const LocationDescription &Loc, bool IsSPMD,

+ bool RequiresFullRuntime);

/// Create a runtime call for kmpc_target_deinit

///

/// \param Loc The insert and source location description.

/// \param IsSPMD Flag to indicate if the kernel is an SPMD kernel or not.

/// \param RequiresFullRuntime Indicate if a full device runtime is necessary.

- void createTargetDeinit(const LocationDescription &Loc, bool IsSPMD, bool RequiresFullRuntime);

+ void createTargetDeinit(const LocationDescription &Loc, bool IsSPMD,

+ bool RequiresFullRuntime);

///}

@@ -1121,7 +1311,25 @@ public:

/// The control-flow structure is standardized for easy consumption by

/// directives associated with loops. For instance, the worksharing-loop

/// construct may change this control flow such that each loop iteration is

-/// executed on only one thread.

+/// executed on only one thread. The constraints of a canonical loop in brief

+/// are:

+///

+/// * The number of loop iterations must have been computed before entering the

+/// loop.

+///

+/// * Has an (unsigned) logical induction variable that starts at zero and

+/// increments by one.

+///

+/// * The loop's CFG itself has no side-effects. The OpenMP specification

+/// itself allows side-effects, but the order in which they happen, including

+/// how often or whether at all, is unspecified. We expect that the frontend

+/// will emit those side-effect instructions somewhere (e.g. before the loop)

+/// such that the CanonicalLoopInfo itself can be side-effect free.

+///

+/// Keep in mind that CanonicalLoopInfo is meant to only describe a repeated

+/// execution of a loop body that satifies these constraints. It does NOT

+/// represent arbitrary SESE regions that happen to contain a loop. Do not use

+/// CanonicalLoopInfo for such purposes.

///

/// The control flow can be described as follows:

///

@@ -1141,73 +1349,149 @@ public:

/// |

/// After

///

-/// Code in the header, condition block, latch and exit block must not have any

-/// side-effect. The body block is the single entry point into the loop body,

-/// which may contain arbitrary control flow as long as all control paths

-/// eventually branch to the latch block.

+/// The loop is thought to start at PreheaderIP (at the Preheader's terminator,

+/// including) and end at AfterIP (at the After's first instruction, excluding).

+/// That is, instructions in the Preheader and After blocks (except the

+/// Preheader's terminator) are out of CanonicalLoopInfo's control and may have

+/// side-effects. Typically, the Preheader is used to compute the loop's trip

+/// count. The instructions from BodyIP (at the Body block's first instruction,

+/// excluding) until the Latch are also considered outside CanonicalLoopInfo's

+/// control and thus can have side-effects. The body block is the single entry

+/// point into the loop body, which may contain arbitrary control flow as long

+/// as all control paths eventually branch to the Latch block.

+///

+/// TODO: Consider adding another standardized BasicBlock between Body CFG and

+/// Latch to guarantee that there is only a single edge to the latch. It would

+/// make loop transformations easier to not needing to consider multiple

+/// predecessors of the latch (See redirectAllPredecessorsTo) and would give us

+/// an equivalant to PreheaderIP, AfterIP and BodyIP for inserting code that

+/// executes after each body iteration.

+///

+/// There must be no loop-carried dependencies through llvm::Values. This is

+/// equivalant to that the Latch has no PHINode and the Header's only PHINode is

+/// for the induction variable.

+///

+/// All code in Header, Cond, Latch and Exit (plus the terminator of the

+/// Preheader) are CanonicalLoopInfo's responsibility and their build-up checked

+/// by assertOK(). They are expected to not be modified unless explicitly

+/// modifying the CanonicalLoopInfo through a methods that applies a OpenMP

+/// loop-associated construct such as applyWorkshareLoop, tileLoops, unrollLoop,

+/// etc. These methods usually invalidate the CanonicalLoopInfo and re-use its

+/// basic blocks. After invalidation, the CanonicalLoopInfo must not be used

+/// anymore as its underlying control flow may not exist anymore.

+/// Loop-transformation methods such as tileLoops, collapseLoops and unrollLoop

+/// may also return a new CanonicalLoopInfo that can be passed to other

+/// loop-associated construct implementing methods. These loop-transforming

+/// methods may either create a new CanonicalLoopInfo usually using

+/// createLoopSkeleton and invalidate the input CanonicalLoopInfo, or reuse and

+/// modify one of the input CanonicalLoopInfo and return it as representing the

+/// modified loop. What is done is an implementation detail of

+/// transformation-implementing method and callers should always assume that the

+/// CanonicalLoopInfo passed to it is invalidated and a new object is returned.

+/// Returned CanonicalLoopInfo have the same structure and guarantees as the one

+/// created by createCanonicalLoop, such that transforming methods do not have

+/// to special case where the CanonicalLoopInfo originated from.

+///

+/// Generally, methods consuming CanonicalLoopInfo do not need an

+/// OpenMPIRBuilder::InsertPointTy as argument, but use the locations of the

+/// CanonicalLoopInfo to insert new or modify existing instructions. Unless

+/// documented otherwise, methods consuming CanonicalLoopInfo do not invalidate

+/// any InsertPoint that is outside CanonicalLoopInfo's control. Specifically,

+/// any InsertPoint in the Preheader, After or Block can still be used after

+/// calling such a method.

///

-/// Defined outside OpenMPIRBuilder because one cannot forward-declare nested

-/// classes.

+/// TODO: Provide mechanisms for exception handling and cancellation points.

+///

+/// Defined outside OpenMPIRBuilder because nested classes cannot be

+/// forward-declared, e.g. to avoid having to include the entire OMPIRBuilder.h.

class CanonicalLoopInfo {

friend class OpenMPIRBuilder;

private:

- /// Whether this object currently represents a loop.

- bool IsValid = false;

- BasicBlock *Preheader;

- BasicBlock *Header;

- BasicBlock *Cond;

- BasicBlock *Body;

- BasicBlock *Latch;

- BasicBlock *Exit;

- BasicBlock *After;

+ BasicBlock *Preheader = nullptr;

+ BasicBlock *Header = nullptr;

+ BasicBlock *Cond = nullptr;

+ BasicBlock *Body = nullptr;

+ BasicBlock *Latch = nullptr;

+ BasicBlock *Exit = nullptr;

+ BasicBlock *After = nullptr;

/// Add the control blocks of this loop to \p BBs.

///

/// This does not include any block from the body, including the one returned

/// by getBody().

+ ///

+ /// FIXME: This currently includes the Preheader and After blocks even though

+ /// their content is (mostly) not under CanonicalLoopInfo's control.

+ /// Re-evaluated whether this makes sense.

void collectControlBlocks(SmallVectorImpl<BasicBlock *> &BBs);

public:

+ /// Returns whether this object currently represents the IR of a loop. If

+ /// returning false, it may have been consumed by a loop transformation or not

+ /// been intialized. Do not use in this case;

+ bool isValid() const { return Header; }

/// The preheader ensures that there is only a single edge entering the loop.

/// Code that must be execute before any loop iteration can be emitted here,

/// such as computing the loop trip count and begin lifetime markers. Code in

/// the preheader is not considered part of the canonical loop.

- BasicBlock *getPreheader() const { return Preheader; }

+ BasicBlock *getPreheader() const {

+ assert(isValid() && "Requires a valid canonical loop");

+ return Preheader;

+ }

/// The header is the entry for each iteration. In the canonical control flow,

/// it only contains the PHINode for the induction variable.

- BasicBlock *getHeader() const { return Header; }

+ BasicBlock *getHeader() const {

+ assert(isValid() && "Requires a valid canonical loop");

+ return Header;

+ }

/// The condition block computes whether there is another loop iteration. If

/// yes, branches to the body; otherwise to the exit block.

- BasicBlock *getCond() const { return Cond; }

+ BasicBlock *getCond() const {

+ assert(isValid() && "Requires a valid canonical loop");

+ return Cond;

+ }

/// The body block is the single entry for a loop iteration and not controlled

/// by CanonicalLoopInfo. It can contain arbitrary control flow but must

/// eventually branch to the \p Latch block.

- BasicBlock *getBody() const { return Body; }

+ BasicBlock *getBody() const {

+ assert(isValid() && "Requires a valid canonical loop");

+ return Body;

+ }

/// Reaching the latch indicates the end of the loop body code. In the

/// canonical control flow, it only contains the increment of the induction

/// variable.

- BasicBlock *getLatch() const { return Latch; }

+ BasicBlock *getLatch() const {

+ assert(isValid() && "Requires a valid canonical loop");

+ return Latch;

+ }

/// Reaching the exit indicates no more iterations are being executed.

- BasicBlock *getExit() const { return Exit; }

+ BasicBlock *getExit() const {

+ assert(isValid() && "Requires a valid canonical loop");

+ return Exit;

+ }

/// The after block is intended for clean-up code such as lifetime end

/// markers. It is separate from the exit block to ensure, analogous to the

/// preheader, it having just a single entry edge and being free from PHI

/// nodes should there be multiple loop exits (such as from break

/// statements/cancellations).

- BasicBlock *getAfter() const { return After; }

+ BasicBlock *getAfter() const {

+ assert(isValid() && "Requires a valid canonical loop");

+ return After;

+ }

/// Returns the llvm::Value containing the number of loop iterations. It must

/// be valid in the preheader and always interpreted as an unsigned integer of

/// any bit-width.

Value *getTripCount() const {

+ assert(isValid() && "Requires a valid canonical loop");

Instruction *CmpI = &Cond->front();

assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");

return CmpI->getOperand(1);

@@ -1216,33 +1500,47 @@ public:

/// Returns the instruction representing the current logical induction

/// variable. Always unsigned, always starting at 0 with an increment of one.

Instruction *getIndVar() const {

+ assert(isValid() && "Requires a valid canonical loop");

Instruction *IndVarPHI = &Header->front();

assert(isa<PHINode>(IndVarPHI) && "First inst must be the IV PHI");

return IndVarPHI;

}

/// Return the type of the induction variable (and the trip count).

- Type *getIndVarType() const { return getIndVar()->getType(); }

+ Type *getIndVarType() const {

+ assert(isValid() && "Requires a valid canonical loop");

+ return getIndVar()->getType();

+ }

/// Return the insertion point for user code before the loop.

OpenMPIRBuilder::InsertPointTy getPreheaderIP() const {

+ assert(isValid() && "Requires a valid canonical loop");

return {Preheader, std::prev(Preheader->end())};

};

/// Return the insertion point for user code in the body.

OpenMPIRBuilder::InsertPointTy getBodyIP() const {

+ assert(isValid() && "Requires a valid canonical loop");

return {Body, Body->begin()};

};

/// Return the insertion point for user code after the loop.

OpenMPIRBuilder::InsertPointTy getAfterIP() const {

+ assert(isValid() && "Requires a valid canonical loop");

return {After, After->begin()};

};

- Function *getFunction() const { return Header->getParent(); }

+ Function *getFunction() const {

+ assert(isValid() && "Requires a valid canonical loop");

+ return Header->getParent();

+ }

/// Consistency self-check.

void assertOK() const;

+ /// Invalidate this loop. That is, the underlying IR does not fulfill the

+ /// requirements of an OpenMP canonical loop anymore.

+ void invalidate();

};

} // end namespace llvm

diff --git a/contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index eb673b199fc4..8e4f7568fb9c 100644
--- a/contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def

@@ -39,7 +39,6 @@ __OMP_TYPE(Int32Ptr)

__OMP_TYPE(Int64Ptr)

OMP_TYPE(SizeTy, M.getDataLayout().getIntPtrType(Ctx))

-OMP_TYPE(LanemaskTy, getLanemaskType())

#define __OMP_PTR_TYPE(NAME, BASE) OMP_TYPE(NAME, BASE->getPointerTo())

@@ -272,6 +271,15 @@ __OMP_RTL(__kmpc_for_static_init_8, false, Void, IdentPtr, Int32, Int32,

__OMP_RTL(__kmpc_for_static_init_8u, false, Void, IdentPtr, Int32, Int32,

Int32Ptr, Int64Ptr, Int64Ptr, Int64Ptr, Int64, Int64)

__OMP_RTL(__kmpc_for_static_fini, false, Void, IdentPtr, Int32)

+__OMP_RTL(__kmpc_distribute_static_init_4, false, Void, IdentPtr, Int32, Int32,

+ Int32Ptr, Int32Ptr, Int32Ptr, Int32Ptr, Int32, Int32)

+__OMP_RTL(__kmpc_distribute_static_init_4u, false, Void, IdentPtr, Int32, Int32,

+ Int32Ptr, Int32Ptr, Int32Ptr, Int32Ptr, Int32, Int32)

+__OMP_RTL(__kmpc_distribute_static_init_8, false, Void, IdentPtr, Int32, Int32,

+ Int32Ptr, Int64Ptr, Int64Ptr, Int64Ptr, Int64, Int64)

+__OMP_RTL(__kmpc_distribute_static_init_8u, false, Void, IdentPtr, Int32, Int32,

+ Int32Ptr, Int64Ptr, Int64Ptr, Int64Ptr, Int64, Int64)

+__OMP_RTL(__kmpc_distribute_static_fini, false, Void, IdentPtr, Int32)

__OMP_RTL(__kmpc_dist_dispatch_init_4, false, Void, IdentPtr, Int32, Int32,

Int32Ptr, Int32, Int32, Int32, Int32)

__OMP_RTL(__kmpc_dist_dispatch_init_4u, false, Void, IdentPtr, Int32, Int32,

@@ -415,8 +423,8 @@ __OMP_RTL(__kmpc_task_allow_completion_event, false, VoidPtr, IdentPtr,

/* Int */ Int32, /* kmp_task_t */ VoidPtr)

/// OpenMP Device runtime functions

-__OMP_RTL(__kmpc_target_init, false, Int32, IdentPtr, Int1, Int1, Int1)

-__OMP_RTL(__kmpc_target_deinit, false, Void, IdentPtr, Int1, Int1)

+__OMP_RTL(__kmpc_target_init, false, Int32, IdentPtr, Int8, Int1, Int1)

+__OMP_RTL(__kmpc_target_deinit, false, Void, IdentPtr, Int8, Int1)

__OMP_RTL(__kmpc_kernel_prepare_parallel, false, Void, VoidPtr)

__OMP_RTL(__kmpc_parallel_51, false, Void, IdentPtr, Int32, Int32, Int32, Int32,

VoidPtr, VoidPtr, VoidPtrPtr, SizeTy)

@@ -442,9 +450,12 @@ __OMP_RTL(__kmpc_get_shared_variables, false, Void, VoidPtrPtrPtr)

__OMP_RTL(__kmpc_parallel_level, false, Int8, )

__OMP_RTL(__kmpc_is_spmd_exec_mode, false, Int8, )

__OMP_RTL(__kmpc_barrier_simple_spmd, false, Void, IdentPtr, Int32)

+__OMP_RTL(__kmpc_barrier_simple_generic, false, Void, IdentPtr, Int32)

-__OMP_RTL(__kmpc_warp_active_thread_mask, false, LanemaskTy,)

-__OMP_RTL(__kmpc_syncwarp, false, Void, LanemaskTy)

+__OMP_RTL(__kmpc_warp_active_thread_mask, false, Int64,)

+__OMP_RTL(__kmpc_syncwarp, false, Void, Int64)

+__OMP_RTL(__kmpc_get_warp_size, false, Int32, )

__OMP_RTL(__kmpc_is_generic_main_thread_id, false, Int8, Int32)

@@ -510,6 +521,11 @@ __OMP_ATTRS_SET(NoCaptureAttrs,

? AttributeSet(EnumAttr(NoCapture))

: AttributeSet(EnumAttr(NoCapture)))

+__OMP_ATTRS_SET(AlwaysInlineAttrs,

+ OptimisticAttributes

+ ? AttributeSet(EnumAttr(AlwaysInline))

+ : AttributeSet(EnumAttr(AlwaysInline)))

#if 0

__OMP_ATTRS_SET(InaccessibleOnlyAttrs,

OptimisticAttributes

@@ -535,6 +551,11 @@ __OMP_ATTRS_SET(ReadOnlyPtrAttrs,

EnumAttr(NoCapture))

: AttributeSet())

+__OMP_ATTRS_SET(DeviceAllocAttrs,

+ OptimisticAttributes

+ ? AttributeSet(EnumAttr(NoUnwind), EnumAttr(NoSync))

+ : AttributeSet(EnumAttr(NoUnwind), EnumAttr(NoSync)))

#if 0

__OMP_ATTRS_SET(WriteOnlyPtrAttrs,

OptimisticAttributes

@@ -575,6 +596,8 @@ __OMP_RTL_ATTRS(__kmpc_barrier, BarrierAttrs, AttributeSet(),

ParamAttrs(ReadOnlyPtrAttrs))

__OMP_RTL_ATTRS(__kmpc_barrier_simple_spmd, BarrierAttrs, AttributeSet(),

ParamAttrs(ReadOnlyPtrAttrs))

+__OMP_RTL_ATTRS(__kmpc_barrier_simple_generic, BarrierAttrs, AttributeSet(),

+ ParamAttrs(ReadOnlyPtrAttrs))

__OMP_RTL_ATTRS(__kmpc_warp_active_thread_mask, BarrierAttrs, AttributeSet(),

ParamAttrs())

__OMP_RTL_ATTRS(__kmpc_syncwarp, BarrierAttrs, AttributeSet(), ParamAttrs())

@@ -703,6 +726,28 @@ __OMP_RTL_ATTRS(__kmpc_for_static_init_8u, GetterArgWriteAttrs, AttributeSet(),

AttributeSet(), AttributeSet()))

__OMP_RTL_ATTRS(__kmpc_for_static_fini, InaccessibleArgOnlyAttrs,

AttributeSet(), ParamAttrs(ReadOnlyPtrAttrs))

+__OMP_RTL_ATTRS(__kmpc_distribute_static_init_4, GetterArgWriteAttrs,

+ AttributeSet(),

+ ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),

+ ArgPtrAttrs, ArgPtrAttrs, ArgPtrAttrs, ArgPtrAttrs,

+ AttributeSet(), AttributeSet()))

+__OMP_RTL_ATTRS(__kmpc_distribute_static_init_4u, GetterArgWriteAttrs,

+ AttributeSet(),

+ ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),

+ ArgPtrAttrs, ArgPtrAttrs, ArgPtrAttrs, ArgPtrAttrs,

+ AttributeSet(), AttributeSet()))

+__OMP_RTL_ATTRS(__kmpc_distribute_static_init_8, GetterArgWriteAttrs,

+ AttributeSet(),

+ ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),

+ ArgPtrAttrs, ArgPtrAttrs, ArgPtrAttrs, ArgPtrAttrs,

+ AttributeSet(), AttributeSet()))

+__OMP_RTL_ATTRS(__kmpc_distribute_static_init_8u, GetterArgWriteAttrs,

+ AttributeSet(),

+ ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),

+ ArgPtrAttrs, ArgPtrAttrs, ArgPtrAttrs, ArgPtrAttrs,

+ AttributeSet(), AttributeSet()))

+__OMP_RTL_ATTRS(__kmpc_distribute_static_fini, InaccessibleArgOnlyAttrs,

+ AttributeSet(), ParamAttrs(ReadOnlyPtrAttrs))

__OMP_RTL_ATTRS(__kmpc_dist_dispatch_init_4, GetterArgWriteAttrs,

AttributeSet(),

ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),

@@ -854,9 +899,9 @@ __OMP_RTL_ATTRS(__kmpc_doacross_wait, BarrierAttrs, AttributeSet(),

__OMP_RTL_ATTRS(__kmpc_doacross_fini, BarrierAttrs, AttributeSet(),

ParamAttrs(ReadOnlyPtrAttrs))

-__OMP_RTL_ATTRS(__kmpc_alloc_shared, DefaultAttrs, ReturnPtrAttrs,

+__OMP_RTL_ATTRS(__kmpc_alloc_shared, DeviceAllocAttrs, ReturnPtrAttrs,

ParamAttrs())

-__OMP_RTL_ATTRS(__kmpc_free_shared, AllocAttrs, AttributeSet(),

+__OMP_RTL_ATTRS(__kmpc_free_shared, DeviceAllocAttrs, AttributeSet(),

ParamAttrs(NoCaptureAttrs))

__OMP_RTL_ATTRS(__kmpc_alloc, DefaultAttrs, ReturnPtrAttrs, ParamAttrs())

@@ -897,6 +942,9 @@ __OMP_RTL_ATTRS(__tgt_push_mapper_component, ForkAttrs, AttributeSet(),

__OMP_RTL_ATTRS(__kmpc_task_allow_completion_event, DefaultAttrs,

ReturnPtrAttrs, ParamAttrs(ReadOnlyPtrAttrs))

+__OMP_RTL_ATTRS(__kmpc_parallel_51, AlwaysInlineAttrs, AttributeSet(),

+ ParamAttrs())

#undef __OMP_RTL_ATTRS

#undef OMP_RTL_ATTRS

#undef AttributeSet

@@ -920,6 +968,7 @@ __OMP_RTL_ATTRS(__kmpc_task_allow_completion_event, DefaultAttrs,

OMP_IDENT_FLAG(OMP_IDENT_FLAG_##Name, #Name, Value)

__OMP_IDENT_FLAG(KMPC, 0x02)

+__OMP_IDENT_FLAG(ATOMIC_REDUCE, 0x10)

__OMP_IDENT_FLAG(BARRIER_EXPL, 0x20)

__OMP_IDENT_FLAG(BARRIER_IMPL, 0x0040)

__OMP_IDENT_FLAG(BARRIER_IMPL_MASK, 0x01C0)