aboutsummaryrefslogtreecommitdiff
path: root/contrib/llvm-project/llvm/include/llvm/Frontend
diff options
context:
space:
mode:
authorDimitry Andric <dim@FreeBSD.org>2022-03-20 11:40:34 +0000
committerDimitry Andric <dim@FreeBSD.org>2022-05-14 11:43:05 +0000
commit349cc55c9796c4596a5b9904cd3281af295f878f (patch)
tree410c5a785075730a35f1272ca6a7adf72222ad03 /contrib/llvm-project/llvm/include/llvm/Frontend
parentcb2ae6163174b90e999326ecec3699ee093a5d43 (diff)
parentc0981da47d5696fe36474fcf86b4ce03ae3ff818 (diff)
downloadsrc-349cc55c9796c4596a5b9904cd3281af295f878f.tar.gz
src-349cc55c9796c4596a5b9904cd3281af295f878f.zip
Diffstat (limited to 'contrib/llvm-project/llvm/include/llvm/Frontend')
-rw-r--r--contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMP.td69
-rw-r--r--contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h8
-rw-r--r--contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h117
-rw-r--r--contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h430
-rw-r--r--contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def63
5 files changed, 544 insertions, 143 deletions
diff --git a/contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMP.td b/contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMP.td
index 3dc6194c7830..5ee379b7fcad 100644
--- a/contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMP.td
+++ b/contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMP.td
@@ -144,6 +144,26 @@ def OMPC_Schedule : Clause<"schedule"> {
];
}
+def OMP_MEMORY_ORDER_SeqCst : ClauseVal<"seq_cst", 1, 1> {}
+def OMP_MEMORY_ORDER_AcqRel : ClauseVal<"acq_rel", 2, 1> {}
+def OMP_MEMORY_ORDER_Acquire : ClauseVal<"acquire", 3, 1> {}
+def OMP_MEMORY_ORDER_Release : ClauseVal<"release", 4, 1> {}
+def OMP_MEMORY_ORDER_Relaxed : ClauseVal<"relaxed", 5, 1> {}
+def OMP_MEMORY_ORDER_Default : ClauseVal<"default", 6, 0> {
+ let isDefault = 1;
+}
+def OMPC_MemoryOrder : Clause<"memory_order"> {
+ let enumClauseValue = "MemoryOrderKind";
+ let allowedClauseValues = [
+ OMP_MEMORY_ORDER_SeqCst,
+ OMP_MEMORY_ORDER_AcqRel,
+ OMP_MEMORY_ORDER_Acquire,
+ OMP_MEMORY_ORDER_Release,
+ OMP_MEMORY_ORDER_Relaxed,
+ OMP_MEMORY_ORDER_Default
+ ];
+}
+
def OMPC_Ordered : Clause<"ordered"> {
let clangClass = "OMPOrderedClause";
let flangClass = "ScalarIntConstantExpr";
@@ -261,13 +281,17 @@ def OMPC_Allocate : Clause<"allocate"> {
}
def OMPC_NonTemporal : Clause<"nontemporal"> {
let clangClass = "OMPNontemporalClause";
+ let flangClass = "Name";
+ let isValueList = true;
}
-def OMP_ORDER_concurrent : ClauseVal<"default",2,0> { let isDefault = 1; }
+def OMP_ORDER_concurrent : ClauseVal<"concurrent",1,1> {}
+def OMP_ORDER_unknown : ClauseVal<"unknown",2,0> { let isDefault = 1; }
def OMPC_Order : Clause<"order"> {
let clangClass = "OMPOrderClause";
let enumClauseValue = "OrderKind";
let allowedClauseValues = [
+ OMP_ORDER_unknown,
OMP_ORDER_concurrent
];
}
@@ -312,6 +336,8 @@ def OMPC_Uniform : Clause<"uniform"> {
}
def OMPC_DeviceType : Clause<"device_type"> {}
def OMPC_Match : Clause<"match"> {}
+def OMPC_AdjustArgs : Clause<"adjust_args"> { }
+def OMPC_AppendArgs : Clause<"append_args"> { }
def OMPC_Depobj : Clause<"depobj"> {
let clangClass = "OMPDepobjClause";
let isImplicit = true;
@@ -337,6 +363,14 @@ def OMPC_Filter : Clause<"filter"> {
let clangClass = "OMPFilterClause";
let flangClass = "ScalarIntExpr";
}
+def OMPC_Align : Clause<"align"> {
+ let clangClass = "OMPAlignClause";
+}
+def OMPC_When: Clause<"when"> {}
+
+def OMPC_Bind : Clause<"bind"> {
+ let clangClass = "OMPBindClause";
+}
//===----------------------------------------------------------------------===//
// Definition of OpenMP directives
@@ -473,8 +507,8 @@ def OMP_TaskWait : Directive<"taskwait"> {
}
def OMP_TaskGroup : Directive<"taskgroup"> {
let allowedClauses = [
- VersionedClause<OMPC_TaskReduction>,
- VersionedClause<OMPC_Allocate>
+ VersionedClause<OMPC_TaskReduction, 50>,
+ VersionedClause<OMPC_Allocate, 50>
];
}
def OMP_Flush : Directive<"flush"> {
@@ -489,10 +523,12 @@ def OMP_Flush : Directive<"flush"> {
}
def OMP_Ordered : Directive<"ordered"> {
let allowedClauses = [
- VersionedClause<OMPC_Threads>,
- VersionedClause<OMPC_Simd>,
VersionedClause<OMPC_Depend>
];
+ let allowedOnceClauses = [
+ VersionedClause<OMPC_Threads>,
+ VersionedClause<OMPC_Simd>
+ ];
}
def OMP_Atomic : Directive<"atomic"> {
let allowedClauses = [
@@ -1506,13 +1542,18 @@ def OMP_TargetTeamsDistributeSimd :
}
def OMP_Allocate : Directive<"allocate"> {
let allowedOnceClauses = [
- VersionedClause<OMPC_Allocator>
+ VersionedClause<OMPC_Allocator>,
+ VersionedClause<OMPC_Align, 51>
];
}
def OMP_DeclareVariant : Directive<"declare variant"> {
let allowedClauses = [
VersionedClause<OMPC_Match>
];
+ let allowedExclusiveClauses = [
+ VersionedClause<OMPC_AdjustArgs, 51>,
+ VersionedClause<OMPC_AppendArgs, 51>
+ ];
}
def OMP_MasterTaskloop : Directive<"master taskloop"> {
let allowedClauses = [
@@ -1699,6 +1740,22 @@ def OMP_masked : Directive<"masked"> {
VersionedClause<OMPC_Filter>
];
}
+def OMP_loop : Directive<"loop"> {
+ let allowedClauses = [
+ VersionedClause<OMPC_LastPrivate>,
+ VersionedClause<OMPC_Private>,
+ VersionedClause<OMPC_Reduction>,
+ ];
+ let allowedOnceClauses = [
+ VersionedClause<OMPC_Bind, 50>,
+ VersionedClause<OMPC_Collapse>,
+ VersionedClause<OMPC_Order>,
+ ];
+}
+def OMP_Metadirective : Directive<"metadirective"> {
+ let allowedClauses = [VersionedClause<OMPC_When>];
+ let allowedOnceClauses = [VersionedClause<OMPC_Default>];
+}
def OMP_Unknown : Directive<"unknown"> {
let isDefault = true;
}
diff --git a/contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h b/contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
index d174cc8992dd..2fec3e7e4230 100644
--- a/contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
+++ b/contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMPConstants.h
@@ -128,6 +128,14 @@ enum class OMPScheduleType {
LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue */ ModifierMask)
};
+enum OMPTgtExecModeFlags : int8_t {
+ OMP_TGT_EXEC_MODE_GENERIC = 1 << 0,
+ OMP_TGT_EXEC_MODE_SPMD = 1 << 1,
+ OMP_TGT_EXEC_MODE_GENERIC_SPMD =
+ OMP_TGT_EXEC_MODE_GENERIC | OMP_TGT_EXEC_MODE_SPMD,
+ LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue */ OMP_TGT_EXEC_MODE_GENERIC_SPMD)
+};
+
} // end namespace omp
} // end namespace llvm
diff --git a/contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h b/contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
index 0b6aed1e9e12..89f5de229b3b 100644
--- a/contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
+++ b/contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMPGridValues.h
@@ -29,100 +29,89 @@ namespace omp {
///
/// Example usage in clang:
/// const unsigned slot_size =
-/// ctx.GetTargetInfo().getGridValue(llvm::omp::GVIDX::GV_Warp_Size);
+/// ctx.GetTargetInfo().getGridValue().GV_Warp_Size;
///
/// Example usage in libomptarget/deviceRTLs:
/// #include "llvm/Frontend/OpenMP/OMPGridValues.h"
/// #ifdef __AMDGPU__
-/// #define GRIDVAL AMDGPUGpuGridValues
+/// #define GRIDVAL AMDGPUGridValues
/// #else
-/// #define GRIDVAL NVPTXGpuGridValues
+/// #define GRIDVAL NVPTXGridValues
/// #endif
/// ... Then use this reference for GV_Warp_Size in the deviceRTL source.
-/// llvm::omp::GRIDVAL[llvm::omp::GVIDX::GV_Warp_Size]
+/// llvm::omp::GRIDVAL().GV_Warp_Size
///
/// Example usage in libomptarget hsa plugin:
/// #include "llvm/Frontend/OpenMP/OMPGridValues.h"
-/// #define GRIDVAL AMDGPUGpuGridValues
+/// #define GRIDVAL AMDGPUGridValues
/// ... Then use this reference to access GV_Warp_Size in the hsa plugin.
-/// llvm::omp::GRIDVAL[llvm::omp::GVIDX::GV_Warp_Size]
+/// llvm::omp::GRIDVAL().GV_Warp_Size
///
/// Example usage in libomptarget cuda plugin:
/// #include "llvm/Frontend/OpenMP/OMPGridValues.h"
-/// #define GRIDVAL NVPTXGpuGridValues
+/// #define GRIDVAL NVPTXGridValues
/// ... Then use this reference to access GV_Warp_Size in the cuda plugin.
-/// llvm::omp::GRIDVAL[llvm::omp::GVIDX::GV_Warp_Size]
+/// llvm::omp::GRIDVAL().GV_Warp_Size
///
-enum GVIDX {
- /// The maximum number of workers in a kernel.
- /// (THREAD_ABSOLUTE_LIMIT) - (GV_Warp_Size), might be issue for blockDim.z
- GV_Threads,
+
+struct GV {
/// The size reserved for data in a shared memory slot.
- GV_Slot_Size,
+ const unsigned GV_Slot_Size;
/// The default value of maximum number of threads in a worker warp.
- GV_Warp_Size,
- /// Alternate warp size for some AMDGCN architectures. Same as GV_Warp_Size
- /// for NVPTX.
- GV_Warp_Size_32,
- /// The number of bits required to represent the max number of threads in warp
- GV_Warp_Size_Log2,
- /// GV_Warp_Size * GV_Slot_Size,
- GV_Warp_Slot_Size,
+ const unsigned GV_Warp_Size;
+
+ constexpr unsigned warpSlotSize() const {
+ return GV_Warp_Size * GV_Slot_Size;
+ }
+
/// the maximum number of teams.
- GV_Max_Teams,
- /// Global Memory Alignment
- GV_Mem_Align,
- /// (~0u >> (GV_Warp_Size - GV_Warp_Size_Log2))
- GV_Warp_Size_Log2_Mask,
+ const unsigned GV_Max_Teams;
// An alternative to the heavy data sharing infrastructure that uses global
// memory is one that uses device __shared__ memory. The amount of such space
// (in bytes) reserved by the OpenMP runtime is noted here.
- GV_SimpleBufferSize,
+ const unsigned GV_SimpleBufferSize;
// The absolute maximum team size for a working group
- GV_Max_WG_Size,
+ const unsigned GV_Max_WG_Size;
// The default maximum team size for a working group
- GV_Default_WG_Size,
- // This is GV_Max_WG_Size / GV_WarpSize. 32 for NVPTX and 16 for AMDGCN.
- GV_Max_Warp_Number,
- /// The slot size that should be reserved for a working warp.
- /// (~0u >> (GV_Warp_Size - GV_Warp_Size_Log2))
- GV_Warp_Size_Log2_MaskL
+ const unsigned GV_Default_WG_Size;
+
+ constexpr unsigned maxWarpNumber() const {
+ return GV_Max_WG_Size / GV_Warp_Size;
+ }
};
/// For AMDGPU GPUs
-static constexpr unsigned AMDGPUGpuGridValues[] = {
- 448, // GV_Threads
- 256, // GV_Slot_Size
- 64, // GV_Warp_Size
- 32, // GV_Warp_Size_32
- 6, // GV_Warp_Size_Log2
- 64 * 256, // GV_Warp_Slot_Size
- 128, // GV_Max_Teams
- 256, // GV_Mem_Align
- 63, // GV_Warp_Size_Log2_Mask
- 896, // GV_SimpleBufferSize
- 1024, // GV_Max_WG_Size,
- 256, // GV_Defaut_WG_Size
- 1024 / 64, // GV_Max_WG_Size / GV_WarpSize
- 63 // GV_Warp_Size_Log2_MaskL
+static constexpr GV AMDGPUGridValues64 = {
+ 256, // GV_Slot_Size
+ 64, // GV_Warp_Size
+ 128, // GV_Max_Teams
+ 896, // GV_SimpleBufferSize
+ 1024, // GV_Max_WG_Size,
+ 256, // GV_Default_WG_Size
};
+static constexpr GV AMDGPUGridValues32 = {
+ 256, // GV_Slot_Size
+ 32, // GV_Warp_Size
+ 128, // GV_Max_Teams
+ 896, // GV_SimpleBufferSize
+ 1024, // GV_Max_WG_Size,
+ 256, // GV_Default_WG_Size
+};
+
+template <unsigned wavesize> constexpr const GV &getAMDGPUGridValues() {
+ static_assert(wavesize == 32 || wavesize == 64, "");
+ return wavesize == 32 ? AMDGPUGridValues32 : AMDGPUGridValues64;
+}
+
/// For Nvidia GPUs
-static constexpr unsigned NVPTXGpuGridValues[] = {
- 992, // GV_Threads
- 256, // GV_Slot_Size
- 32, // GV_Warp_Size
- 32, // GV_Warp_Size_32
- 5, // GV_Warp_Size_Log2
- 32 * 256, // GV_Warp_Slot_Size
- 1024, // GV_Max_Teams
- 256, // GV_Mem_Align
- (~0u >> (32 - 5)), // GV_Warp_Size_Log2_Mask
- 896, // GV_SimpleBufferSize
- 1024, // GV_Max_WG_Size
- 128, // GV_Defaut_WG_Size
- 1024 / 32, // GV_Max_WG_Size / GV_WarpSize
- 31 // GV_Warp_Size_Log2_MaskL
+static constexpr GV NVPTXGridValues = {
+ 256, // GV_Slot_Size
+ 32, // GV_Warp_Size
+ 1024, // GV_Max_Teams
+ 896, // GV_SimpleBufferSize
+ 1024, // GV_Max_WG_Size
+ 128, // GV_Default_WG_Size
};
} // namespace omp
diff --git a/contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 8144f1527a06..563e0eed1762 100644
--- a/contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -257,18 +257,17 @@ public:
///
/// * Sign of the step and the comparison operator might disagree:
///
- /// for (int i = 0; i < 42; --i)
+ /// for (int i = 0; i < 42; i -= 1u)
///
//
/// \param Loc The insert and source location description.
/// \param BodyGenCB Callback that will generate the loop body code.
/// \param Start Value of the loop counter for the first iterations.
- /// \param Stop Loop counter values past this will stop the the
- /// iterations.
+ /// \param Stop Loop counter values past this will stop the loop.
/// \param Step Loop counter increment after each iteration; negative
- /// means counting down. \param IsSigned Whether Start, Stop
- /// and Stop are signed integers.
- /// \param InclusiveStop Whether \p Stop itself is a valid value for the loop
+ /// means counting down.
+ /// \param IsSigned Whether Start, Stop and Step are signed integers.
+ /// \param InclusiveStop Whether \p Stop itself is a valid value for the loop
/// counter.
/// \param ComputeIP Insertion point for instructions computing the trip
/// count. Can be used to ensure the trip count is available
@@ -335,7 +334,7 @@ public:
/// has a trip count of 0). This is permitted by the OpenMP specification.
///
/// \param DL Debug location for instructions added for collapsing,
- /// such as instructions to compute derive the input loop's
+ /// such as instructions to compute/derive the input loop's
/// induction variables.
/// \param Loops Loops in the loop nest to collapse. Loops are specified
/// from outermost-to-innermost and every control flow of a
@@ -358,8 +357,16 @@ public:
/// the current thread, updates the relevant instructions in the canonical
/// loop and calls to an OpenMP runtime finalization function after the loop.
///
- /// \param Loc The source location description, the insertion location
- /// is not used.
+ /// TODO: Workshare loops with static scheduling may contain up to two loops
+ /// that fulfill the requirements of an OpenMP canonical loop. One for
+ /// iterating over all iterations of a chunk and another one for iterating
+ /// over all chunks that are executed on the same thread. Returning
+ /// CanonicalLoopInfo objects representing them may eventually be useful for
+ /// the apply clause planned in OpenMP 6.0, but currently whether these are
+ /// canonical loops is irrelevant.
+ ///
+ /// \param DL Debug location for instructions added for the
+ /// workshare-loop construct itself.
/// \param CLI A descriptor of the canonical loop to workshare.
/// \param AllocaIP An insertion point for Alloca instructions usable in the
/// preheader of the loop.
@@ -368,12 +375,11 @@ public:
/// \param Chunk The size of loop chunk considered as a unit when
/// scheduling. If \p nullptr, defaults to 1.
///
- /// \returns Updated CanonicalLoopInfo.
- CanonicalLoopInfo *createStaticWorkshareLoop(const LocationDescription &Loc,
- CanonicalLoopInfo *CLI,
- InsertPointTy AllocaIP,
- bool NeedsBarrier,
- Value *Chunk = nullptr);
+ /// \returns Point where to insert code after the workshare construct.
+ InsertPointTy applyStaticWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
+ InsertPointTy AllocaIP,
+ bool NeedsBarrier,
+ Value *Chunk = nullptr);
/// Modifies the canonical loop to be a dynamically-scheduled workshare loop.
///
@@ -382,8 +388,9 @@ public:
/// turn it into a workshare loop. In particular, it calls to an OpenMP
/// runtime function in the preheader to obtain, and then in each iteration
/// to update the loop counter.
- /// \param Loc The source location description, the insertion location
- /// is not used.
+ ///
+ /// \param DL Debug location for instructions added for the
+ /// workshare-loop construct itself.
/// \param CLI A descriptor of the canonical loop to workshare.
/// \param AllocaIP An insertion point for Alloca instructions usable in the
/// preheader of the loop.
@@ -393,13 +400,12 @@ public:
/// \param Chunk The size of loop chunk considered as a unit when
/// scheduling. If \p nullptr, defaults to 1.
///
- /// \returns Point where to insert code after the loop.
- InsertPointTy createDynamicWorkshareLoop(const LocationDescription &Loc,
- CanonicalLoopInfo *CLI,
- InsertPointTy AllocaIP,
- omp::OMPScheduleType SchedType,
- bool NeedsBarrier,
- Value *Chunk = nullptr);
+ /// \returns Point where to insert code after the workshare construct.
+ InsertPointTy applyDynamicWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
+ InsertPointTy AllocaIP,
+ omp::OMPScheduleType SchedType,
+ bool NeedsBarrier,
+ Value *Chunk = nullptr);
/// Modifies the canonical loop to be a workshare loop.
///
@@ -410,19 +416,17 @@ public:
/// the current thread, updates the relevant instructions in the canonical
/// loop and calls to an OpenMP runtime finalization function after the loop.
///
- /// \param Loc The source location description, the insertion location
- /// is not used.
+ /// \param DL Debug location for instructions added for the
+ /// workshare-loop construct itself.
/// \param CLI A descriptor of the canonical loop to workshare.
/// \param AllocaIP An insertion point for Alloca instructions usable in the
/// preheader of the loop.
/// \param NeedsBarrier Indicates whether a barrier must be insterted after
/// the loop.
///
- /// \returns Updated CanonicalLoopInfo.
- CanonicalLoopInfo *createWorkshareLoop(const LocationDescription &Loc,
- CanonicalLoopInfo *CLI,
- InsertPointTy AllocaIP,
- bool NeedsBarrier);
+ /// \returns Point where to insert code after the workshare construct.
+ InsertPointTy applyWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
+ InsertPointTy AllocaIP, bool NeedsBarrier);
/// Tile a loop nest.
///
@@ -471,6 +475,48 @@ public:
tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
ArrayRef<Value *> TileSizes);
+ /// Fully unroll a loop.
+ ///
+ /// Instead of unrolling the loop immediately (and duplicating its body
+ /// instructions), it is deferred to LLVM's LoopUnrollPass by adding loop
+ /// metadata.
+ ///
+ /// \param DL Debug location for instructions added by unrolling.
+ /// \param Loop The loop to unroll. The loop will be invalidated.
+ void unrollLoopFull(DebugLoc DL, CanonicalLoopInfo *Loop);
+
+ /// Fully or partially unroll a loop. How the loop is unrolled is determined
+ /// using LLVM's LoopUnrollPass.
+ ///
+ /// \param DL Debug location for instructions added by unrolling.
+ /// \param Loop The loop to unroll. The loop will be invalidated.
+ void unrollLoopHeuristic(DebugLoc DL, CanonicalLoopInfo *Loop);
+
+ /// Partially unroll a loop.
+ ///
+ /// The CanonicalLoopInfo of the unrolled loop for use with chained
+ /// loop-associated directive can be requested using \p UnrolledCLI. Not
+ /// needing the CanonicalLoopInfo allows more efficient code generation by
+ /// deferring the actual unrolling to the LoopUnrollPass using loop metadata.
+ /// A loop-associated directive applied to the unrolled loop needs to know the
+ /// new trip count which means that if using a heuristically determined unroll
+ /// factor (\p Factor == 0), that factor must be computed immediately. We are
+ /// using the same logic as the LoopUnrollPass to derived the unroll factor,
+ /// but which assumes that some canonicalization has taken place (e.g.
+ /// Mem2Reg, LICM, GVN, Inlining, etc.). That is, the heuristic will perform
+ /// better when the unrolled loop's CanonicalLoopInfo is not needed.
+ ///
+ /// \param DL Debug location for instructions added by unrolling.
+ /// \param Loop The loop to unroll. The loop will be invalidated.
+ /// \param Factor The factor to unroll the loop by. A factor of 0
+ /// indicates that a heuristic should be used to determine
+ /// the unroll-factor.
+ /// \param UnrolledCLI If non-null, receives the CanonicalLoopInfo of the
+ /// partially unrolled loop. Otherwise, uses loop metadata
+ /// to defer unrolling to the LoopUnrollPass.
+ void unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop, int32_t Factor,
+ CanonicalLoopInfo **UnrolledCLI);
+
/// Generator for '#omp flush'
///
/// \param Loc The location where the flush directive was encountered
@@ -486,6 +532,115 @@ public:
/// \param Loc The location where the taskyield directive was encountered.
void createTaskyield(const LocationDescription &Loc);
+ /// Functions used to generate reductions. Such functions take two Values
+ /// representing LHS and RHS of the reduction, respectively, and a reference
+ /// to the value that is updated to refer to the reduction result.
+ using ReductionGenTy =
+ function_ref<InsertPointTy(InsertPointTy, Value *, Value *, Value *&)>;
+
+ /// Functions used to generate atomic reductions. Such functions take two
+ /// Values representing pointers to LHS and RHS of the reduction. They are
+ /// expected to atomically update the LHS to the reduced value.
+ using AtomicReductionGenTy =
+ function_ref<InsertPointTy(InsertPointTy, Value *, Value *)>;
+
+ /// Information about an OpenMP reduction.
+ struct ReductionInfo {
+ ReductionInfo(Value *Variable, Value *PrivateVariable,
+ ReductionGenTy ReductionGen,
+ AtomicReductionGenTy AtomicReductionGen)
+ : Variable(Variable), PrivateVariable(PrivateVariable),
+ ReductionGen(ReductionGen), AtomicReductionGen(AtomicReductionGen) {}
+
+ /// Returns the type of the element being reduced.
+ Type *getElementType() const {
+ return Variable->getType()->getPointerElementType();
+ }
+
+ /// Reduction variable of pointer type.
+ Value *Variable;
+
+ /// Thread-private partial reduction variable.
+ Value *PrivateVariable;
+
+ /// Callback for generating the reduction body. The IR produced by this will
+ /// be used to combine two values in a thread-safe context, e.g., under
+ /// lock or within the same thread, and therefore need not be atomic.
+ ReductionGenTy ReductionGen;
+
+ /// Callback for generating the atomic reduction body, may be null. The IR
+ /// produced by this will be used to atomically combine two values during
+ /// reduction. If null, the implementation will use the non-atomic version
+ /// along with the appropriate synchronization mechanisms.
+ AtomicReductionGenTy AtomicReductionGen;
+ };
+
+ // TODO: provide atomic and non-atomic reduction generators for reduction
+ // operators defined by the OpenMP specification.
+
+ /// Generator for '#omp reduction'.
+ ///
+ /// Emits the IR instructing the runtime to perform the specific kind of
+ /// reductions. Expects reduction variables to have been privatized and
+ /// initialized to reduction-neutral values separately. Emits the calls to
+ /// runtime functions as well as the reduction function and the basic blocks
+ /// performing the reduction atomically and non-atomically.
+ ///
+ /// The code emitted for the following:
+ ///
+ /// \code
+ /// type var_1;
+ /// type var_2;
+ /// #pragma omp <directive> reduction(reduction-op:var_1,var_2)
+ /// /* body */;
+ /// \endcode
+ ///
+ /// corresponds to the following sketch.
+ ///
+ /// \code
+ /// void _outlined_par() {
+ /// // N is the number of different reductions.
+ /// void *red_array[] = {privatized_var_1, privatized_var_2, ...};
+ /// switch(__kmpc_reduce(..., N, /*size of data in red array*/, red_array,
+ /// _omp_reduction_func,
+ /// _gomp_critical_user.reduction.var)) {
+ /// case 1: {
+ /// var_1 = var_1 <reduction-op> privatized_var_1;
+ /// var_2 = var_2 <reduction-op> privatized_var_2;
+ /// // ...
+ /// __kmpc_end_reduce(...);
+ /// break;
+ /// }
+ /// case 2: {
+ /// _Atomic<ReductionOp>(var_1, privatized_var_1);
+ /// _Atomic<ReductionOp>(var_2, privatized_var_2);
+ /// // ...
+ /// break;
+ /// }
+ /// default: break;
+ /// }
+ /// }
+ ///
+ /// void _omp_reduction_func(void **lhs, void **rhs) {
+ /// *(type *)lhs[0] = *(type *)lhs[0] <reduction-op> *(type *)rhs[0];
+ /// *(type *)lhs[1] = *(type *)lhs[1] <reduction-op> *(type *)rhs[1];
+ /// // ...
+ /// }
+ /// \endcode
+ ///
+ /// \param Loc The location where the reduction was
+ /// encountered. Must be within the associate
+ /// directive and after the last local access to the
+ /// reduction variables.
+ /// \param AllocaIP An insertion point suitable for allocas usable
+ /// in reductions.
+ /// \param ReductionInfos A list of info on each reduction variable.
+ /// \param IsNoWait A flag set if the reduction is marked as nowait.
+ InsertPointTy createReductions(const LocationDescription &Loc,
+ InsertPointTy AllocaIP,
+ ArrayRef<ReductionInfo> ReductionInfos,
+ bool IsNoWait = false);
+
///}
/// Return the insertion point used by the underlying IRBuilder.
@@ -515,6 +670,10 @@ public:
Constant *getOrCreateSrcLocStr(StringRef FunctionName, StringRef FileName,
unsigned Line, unsigned Column);
+ /// Return the (LLVM-IR) string describing the DebugLoc \p DL. Use \p F as
+ /// fallback if \p DL does not specify the function name.
+ Constant *getOrCreateSrcLocStr(DebugLoc DL, Function *F = nullptr);
+
/// Return the (LLVM-IR) string describing the source location \p Loc.
Constant *getOrCreateSrcLocStr(const LocationDescription &Loc);
@@ -524,8 +683,8 @@ public:
omp::IdentFlag Flags = omp::IdentFlag(0),
unsigned Reserve2Flags = 0);
- // Get the type corresponding to __kmpc_impl_lanemask_t from the deviceRTL
- Type *getLanemaskType();
+ /// Create a global flag \p Namein the module with initial value \p Value.
+ GlobalValue *createGlobalFlag(unsigned Value, StringRef Name);
/// Generate control flow and cleanup for cancellation.
///
@@ -651,11 +810,11 @@ public:
/// \param Loc The source location description.
/// \param MapperFunc Function to be called.
/// \param SrcLocInfo Source location information global.
- /// \param MaptypesArgs
- /// \param MapnamesArg
+ /// \param MaptypesArg The argument types.
+ /// \param MapnamesArg The argument names.
/// \param MapperAllocas The AllocaInst used for the call.
/// \param DeviceID Device ID for the call.
- /// \param TotalNbOperand Number of operand in the call.
+ /// \param NumOperands Number of operands in the call.
void emitMapperCall(const LocationDescription &Loc, Function *MapperFunc,
Value *SrcLocInfo, Value *MaptypesArg, Value *MapnamesArg,
struct MapperAllocas &MapperAllocas, int64_t DeviceID,
@@ -705,7 +864,7 @@ public:
/// \param BodyGenCB Callback that will generate the region code.
/// \param FiniCB Callback to finialize variable copies.
///
- /// \returns The insertion position *after* the master.
+ /// \returns The insertion position *after* the masked.
InsertPointTy createMasked(const LocationDescription &Loc,
BodyGenCallbackTy BodyGenCB,
FinalizeCallbackTy FiniCB, Value *Filter);
@@ -718,12 +877,41 @@ public:
/// \param CriticalName name of the lock used by the critical directive
/// \param HintInst Hint Instruction for hint clause associated with critical
///
- /// \returns The insertion position *after* the master.
+ /// \returns The insertion position *after* the critical.
InsertPointTy createCritical(const LocationDescription &Loc,
BodyGenCallbackTy BodyGenCB,
FinalizeCallbackTy FiniCB,
StringRef CriticalName, Value *HintInst);
+ /// Generator for '#omp ordered depend (source | sink)'
+ ///
+ /// \param Loc The insert and source location description.
+ /// \param AllocaIP The insertion point to be used for alloca instructions.
+ /// \param NumLoops The number of loops in depend clause.
+ /// \param StoreValues The value will be stored in vector address.
+ /// \param Name The name of alloca instruction.
+ /// \param IsDependSource If true, depend source; otherwise, depend sink.
+ ///
+ /// \return The insertion position *after* the ordered.
+ InsertPointTy createOrderedDepend(const LocationDescription &Loc,
+ InsertPointTy AllocaIP, unsigned NumLoops,
+ ArrayRef<llvm::Value *> StoreValues,
+ const Twine &Name, bool IsDependSource);
+
+ /// Generator for '#omp ordered [threads | simd]'
+ ///
+ /// \param Loc The insert and source location description.
+ /// \param BodyGenCB Callback that will generate the region code.
+ /// \param FiniCB Callback to finalize variable copies.
+ /// \param IsThreads If true, with threads clause or without clause;
+ /// otherwise, with simd clause;
+ ///
+ /// \returns The insertion position *after* the ordered.
+ InsertPointTy createOrderedThreadsSimd(const LocationDescription &Loc,
+ BodyGenCallbackTy BodyGenCB,
+ FinalizeCallbackTy FiniCB,
+ bool IsThreads);
+
/// Generator for '#omp sections'
///
/// \param Loc The insert and source location description.
@@ -816,14 +1004,16 @@ public:
/// \param Loc The insert and source location description.
/// \param IsSPMD Flag to indicate if the kernel is an SPMD kernel or not.
/// \param RequiresFullRuntime Indicate if a full device runtime is necessary.
- InsertPointTy createTargetInit(const LocationDescription &Loc, bool IsSPMD, bool RequiresFullRuntime);
+ InsertPointTy createTargetInit(const LocationDescription &Loc, bool IsSPMD,
+ bool RequiresFullRuntime);
/// Create a runtime call for kmpc_target_deinit
///
/// \param Loc The insert and source location description.
/// \param IsSPMD Flag to indicate if the kernel is an SPMD kernel or not.
/// \param RequiresFullRuntime Indicate if a full device runtime is necessary.
- void createTargetDeinit(const LocationDescription &Loc, bool IsSPMD, bool RequiresFullRuntime);
+ void createTargetDeinit(const LocationDescription &Loc, bool IsSPMD,
+ bool RequiresFullRuntime);
///}
@@ -1121,7 +1311,25 @@ public:
/// The control-flow structure is standardized for easy consumption by
/// directives associated with loops. For instance, the worksharing-loop
/// construct may change this control flow such that each loop iteration is
-/// executed on only one thread.
+/// executed on only one thread. The constraints of a canonical loop in brief
+/// are:
+///
+/// * The number of loop iterations must have been computed before entering the
+/// loop.
+///
+/// * Has an (unsigned) logical induction variable that starts at zero and
+/// increments by one.
+///
+/// * The loop's CFG itself has no side-effects. The OpenMP specification
+/// itself allows side-effects, but the order in which they happen, including
+/// how often or whether at all, is unspecified. We expect that the frontend
+/// will emit those side-effect instructions somewhere (e.g. before the loop)
+/// such that the CanonicalLoopInfo itself can be side-effect free.
+///
+/// Keep in mind that CanonicalLoopInfo is meant to only describe a repeated
+/// execution of a loop body that satifies these constraints. It does NOT
+/// represent arbitrary SESE regions that happen to contain a loop. Do not use
+/// CanonicalLoopInfo for such purposes.
///
/// The control flow can be described as follows:
///
@@ -1141,73 +1349,149 @@ public:
/// |
/// After
///
-/// Code in the header, condition block, latch and exit block must not have any
-/// side-effect. The body block is the single entry point into the loop body,
-/// which may contain arbitrary control flow as long as all control paths
-/// eventually branch to the latch block.
+/// The loop is thought to start at PreheaderIP (at the Preheader's terminator,
+/// including) and end at AfterIP (at the After's first instruction, excluding).
+/// That is, instructions in the Preheader and After blocks (except the
+/// Preheader's terminator) are out of CanonicalLoopInfo's control and may have
+/// side-effects. Typically, the Preheader is used to compute the loop's trip
+/// count. The instructions from BodyIP (at the Body block's first instruction,
+/// excluding) until the Latch are also considered outside CanonicalLoopInfo's
+/// control and thus can have side-effects. The body block is the single entry
+/// point into the loop body, which may contain arbitrary control flow as long
+/// as all control paths eventually branch to the Latch block.
+///
+/// TODO: Consider adding another standardized BasicBlock between Body CFG and
+/// Latch to guarantee that there is only a single edge to the latch. It would
+/// make loop transformations easier to not needing to consider multiple
+/// predecessors of the latch (See redirectAllPredecessorsTo) and would give us
+/// an equivalent to PreheaderIP, AfterIP and BodyIP for inserting code that
+/// executes after each body iteration.
+///
+/// There must be no loop-carried dependencies through llvm::Values. This is
+/// equivalent to requiring that the Latch has no PHINode and the Header's only
+/// PHINode is for the induction variable.
+///
+/// All code in Header, Cond, Latch and Exit (plus the terminator of the
+/// Preheader) is CanonicalLoopInfo's responsibility and its build-up is
+/// checked by assertOK(). It is expected to not be modified unless explicitly
+/// modifying the CanonicalLoopInfo through a method that applies an OpenMP
+/// loop-associated construct such as applyWorkshareLoop, tileLoops, unrollLoop,
+/// etc. These methods usually invalidate the CanonicalLoopInfo and re-use its
+/// basic blocks. After invalidation, the CanonicalLoopInfo must not be used
+/// anymore as its underlying control flow may not exist anymore.
+/// Loop-transformation methods such as tileLoops, collapseLoops and unrollLoop
+/// may also return a new CanonicalLoopInfo that can be passed to other
+/// loop-associated construct implementing methods. These loop-transforming
+/// methods may either create a new CanonicalLoopInfo usually using
+/// createLoopSkeleton and invalidate the input CanonicalLoopInfo, or reuse and
+/// modify one of the input CanonicalLoopInfo and return it as representing the
+/// modified loop. What is done is an implementation detail of
+/// transformation-implementing method and callers should always assume that the
+/// CanonicalLoopInfo passed to it is invalidated and a new object is returned.
+/// Returned CanonicalLoopInfo have the same structure and guarantees as the one
+/// created by createCanonicalLoop, such that transforming methods do not have
+/// to special case where the CanonicalLoopInfo originated from.
+///
+/// Generally, methods consuming CanonicalLoopInfo do not need an
+/// OpenMPIRBuilder::InsertPointTy as argument, but use the locations of the
+/// CanonicalLoopInfo to insert new or modify existing instructions. Unless
+/// documented otherwise, methods consuming CanonicalLoopInfo do not invalidate
+/// any InsertPoint that is outside CanonicalLoopInfo's control. Specifically,
+/// any InsertPoint in the Preheader, After or Body block can still be used after
+/// calling such a method.
///
-/// Defined outside OpenMPIRBuilder because one cannot forward-declare nested
-/// classes.
+/// TODO: Provide mechanisms for exception handling and cancellation points.
+///
+/// Defined outside OpenMPIRBuilder because nested classes cannot be
+/// forward-declared, e.g. to avoid having to include the entire OMPIRBuilder.h.
class CanonicalLoopInfo {
friend class OpenMPIRBuilder;
private:
- /// Whether this object currently represents a loop.
- bool IsValid = false;
-
- BasicBlock *Preheader;
- BasicBlock *Header;
- BasicBlock *Cond;
- BasicBlock *Body;
- BasicBlock *Latch;
- BasicBlock *Exit;
- BasicBlock *After;
+ BasicBlock *Preheader = nullptr;
+ BasicBlock *Header = nullptr;
+ BasicBlock *Cond = nullptr;
+ BasicBlock *Body = nullptr;
+ BasicBlock *Latch = nullptr;
+ BasicBlock *Exit = nullptr;
+ BasicBlock *After = nullptr;
/// Add the control blocks of this loop to \p BBs.
///
/// This does not include any block from the body, including the one returned
/// by getBody().
+ ///
+ /// FIXME: This currently includes the Preheader and After blocks even though
+ /// their content is (mostly) not under CanonicalLoopInfo's control.
+ /// Re-evaluate whether this makes sense.
void collectControlBlocks(SmallVectorImpl<BasicBlock *> &BBs);
public:
+ /// Returns whether this object currently represents the IR of a loop. If
+ /// returning false, it may have been consumed by a loop transformation or not
+ /// been initialized. Do not use in this case.
+ bool isValid() const { return Header; }
+
/// The preheader ensures that there is only a single edge entering the loop.
/// Code that must be executed before any loop iteration can be emitted here,
/// such as computing the loop trip count and begin lifetime markers. Code in
/// the preheader is not considered part of the canonical loop.
- BasicBlock *getPreheader() const { return Preheader; }
+ BasicBlock *getPreheader() const {
+ assert(isValid() && "Requires a valid canonical loop");
+ return Preheader;
+ }
/// The header is the entry for each iteration. In the canonical control flow,
/// it only contains the PHINode for the induction variable.
- BasicBlock *getHeader() const { return Header; }
+ BasicBlock *getHeader() const {
+ assert(isValid() && "Requires a valid canonical loop");
+ return Header;
+ }
/// The condition block computes whether there is another loop iteration. If
/// yes, branches to the body; otherwise to the exit block.
- BasicBlock *getCond() const { return Cond; }
+ BasicBlock *getCond() const {
+ assert(isValid() && "Requires a valid canonical loop");
+ return Cond;
+ }
/// The body block is the single entry for a loop iteration and not controlled
/// by CanonicalLoopInfo. It can contain arbitrary control flow but must
/// eventually branch to the \p Latch block.
- BasicBlock *getBody() const { return Body; }
+ BasicBlock *getBody() const {
+ assert(isValid() && "Requires a valid canonical loop");
+ return Body;
+ }
/// Reaching the latch indicates the end of the loop body code. In the
/// canonical control flow, it only contains the increment of the induction
/// variable.
- BasicBlock *getLatch() const { return Latch; }
+ BasicBlock *getLatch() const {
+ assert(isValid() && "Requires a valid canonical loop");
+ return Latch;
+ }
/// Reaching the exit indicates no more iterations are being executed.
- BasicBlock *getExit() const { return Exit; }
+ BasicBlock *getExit() const {
+ assert(isValid() && "Requires a valid canonical loop");
+ return Exit;
+ }
/// The after block is intended for clean-up code such as lifetime end
/// markers. It is separate from the exit block to ensure, analogous to the
/// preheader, it having just a single entry edge and being free from PHI
/// nodes should there be multiple loop exits (such as from break
/// statements/cancellations).
- BasicBlock *getAfter() const { return After; }
+ BasicBlock *getAfter() const {
+ assert(isValid() && "Requires a valid canonical loop");
+ return After;
+ }
/// Returns the llvm::Value containing the number of loop iterations. It must
/// be valid in the preheader and always interpreted as an unsigned integer of
/// any bit-width.
Value *getTripCount() const {
+ assert(isValid() && "Requires a valid canonical loop");
Instruction *CmpI = &Cond->front();
assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
return CmpI->getOperand(1);
@@ -1216,33 +1500,47 @@ public:
/// Returns the instruction representing the current logical induction
/// variable. Always unsigned, always starting at 0 with an increment of one.
Instruction *getIndVar() const {
+ assert(isValid() && "Requires a valid canonical loop");
Instruction *IndVarPHI = &Header->front();
assert(isa<PHINode>(IndVarPHI) && "First inst must be the IV PHI");
return IndVarPHI;
}
/// Return the type of the induction variable (and the trip count).
- Type *getIndVarType() const { return getIndVar()->getType(); }
+ Type *getIndVarType() const {
+ assert(isValid() && "Requires a valid canonical loop");
+ return getIndVar()->getType();
+ }
/// Return the insertion point for user code before the loop.
OpenMPIRBuilder::InsertPointTy getPreheaderIP() const {
+ assert(isValid() && "Requires a valid canonical loop");
return {Preheader, std::prev(Preheader->end())};
};
/// Return the insertion point for user code in the body.
OpenMPIRBuilder::InsertPointTy getBodyIP() const {
+ assert(isValid() && "Requires a valid canonical loop");
return {Body, Body->begin()};
};
/// Return the insertion point for user code after the loop.
OpenMPIRBuilder::InsertPointTy getAfterIP() const {
+ assert(isValid() && "Requires a valid canonical loop");
return {After, After->begin()};
};
- Function *getFunction() const { return Header->getParent(); }
+ Function *getFunction() const {
+ assert(isValid() && "Requires a valid canonical loop");
+ return Header->getParent();
+ }
/// Consistency self-check.
void assertOK() const;
+
+ /// Invalidate this loop. That is, the underlying IR does not fulfill the
+ /// requirements of an OpenMP canonical loop anymore.
+ void invalidate();
};
} // end namespace llvm
diff --git a/contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
index eb673b199fc4..8e4f7568fb9c 100644
--- a/contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/contrib/llvm-project/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -39,7 +39,6 @@ __OMP_TYPE(Int32Ptr)
__OMP_TYPE(Int64Ptr)
OMP_TYPE(SizeTy, M.getDataLayout().getIntPtrType(Ctx))
-OMP_TYPE(LanemaskTy, getLanemaskType())
#define __OMP_PTR_TYPE(NAME, BASE) OMP_TYPE(NAME, BASE->getPointerTo())
@@ -272,6 +271,15 @@ __OMP_RTL(__kmpc_for_static_init_8, false, Void, IdentPtr, Int32, Int32,
__OMP_RTL(__kmpc_for_static_init_8u, false, Void, IdentPtr, Int32, Int32,
Int32Ptr, Int64Ptr, Int64Ptr, Int64Ptr, Int64, Int64)
__OMP_RTL(__kmpc_for_static_fini, false, Void, IdentPtr, Int32)
+__OMP_RTL(__kmpc_distribute_static_init_4, false, Void, IdentPtr, Int32, Int32,
+ Int32Ptr, Int32Ptr, Int32Ptr, Int32Ptr, Int32, Int32)
+__OMP_RTL(__kmpc_distribute_static_init_4u, false, Void, IdentPtr, Int32, Int32,
+ Int32Ptr, Int32Ptr, Int32Ptr, Int32Ptr, Int32, Int32)
+__OMP_RTL(__kmpc_distribute_static_init_8, false, Void, IdentPtr, Int32, Int32,
+ Int32Ptr, Int64Ptr, Int64Ptr, Int64Ptr, Int64, Int64)
+__OMP_RTL(__kmpc_distribute_static_init_8u, false, Void, IdentPtr, Int32, Int32,
+ Int32Ptr, Int64Ptr, Int64Ptr, Int64Ptr, Int64, Int64)
+__OMP_RTL(__kmpc_distribute_static_fini, false, Void, IdentPtr, Int32)
__OMP_RTL(__kmpc_dist_dispatch_init_4, false, Void, IdentPtr, Int32, Int32,
Int32Ptr, Int32, Int32, Int32, Int32)
__OMP_RTL(__kmpc_dist_dispatch_init_4u, false, Void, IdentPtr, Int32, Int32,
@@ -415,8 +423,8 @@ __OMP_RTL(__kmpc_task_allow_completion_event, false, VoidPtr, IdentPtr,
/* Int */ Int32, /* kmp_task_t */ VoidPtr)
/// OpenMP Device runtime functions
-__OMP_RTL(__kmpc_target_init, false, Int32, IdentPtr, Int1, Int1, Int1)
-__OMP_RTL(__kmpc_target_deinit, false, Void, IdentPtr, Int1, Int1)
+__OMP_RTL(__kmpc_target_init, false, Int32, IdentPtr, Int8, Int1, Int1)
+__OMP_RTL(__kmpc_target_deinit, false, Void, IdentPtr, Int8, Int1)
__OMP_RTL(__kmpc_kernel_prepare_parallel, false, Void, VoidPtr)
__OMP_RTL(__kmpc_parallel_51, false, Void, IdentPtr, Int32, Int32, Int32, Int32,
VoidPtr, VoidPtr, VoidPtrPtr, SizeTy)
@@ -442,9 +450,12 @@ __OMP_RTL(__kmpc_get_shared_variables, false, Void, VoidPtrPtrPtr)
__OMP_RTL(__kmpc_parallel_level, false, Int8, )
__OMP_RTL(__kmpc_is_spmd_exec_mode, false, Int8, )
__OMP_RTL(__kmpc_barrier_simple_spmd, false, Void, IdentPtr, Int32)
+__OMP_RTL(__kmpc_barrier_simple_generic, false, Void, IdentPtr, Int32)
-__OMP_RTL(__kmpc_warp_active_thread_mask, false, LanemaskTy,)
-__OMP_RTL(__kmpc_syncwarp, false, Void, LanemaskTy)
+__OMP_RTL(__kmpc_warp_active_thread_mask, false, Int64,)
+__OMP_RTL(__kmpc_syncwarp, false, Void, Int64)
+
+__OMP_RTL(__kmpc_get_warp_size, false, Int32, )
__OMP_RTL(__kmpc_is_generic_main_thread_id, false, Int8, Int32)
@@ -510,6 +521,11 @@ __OMP_ATTRS_SET(NoCaptureAttrs,
? AttributeSet(EnumAttr(NoCapture))
: AttributeSet(EnumAttr(NoCapture)))
+__OMP_ATTRS_SET(AlwaysInlineAttrs,
+ OptimisticAttributes
+ ? AttributeSet(EnumAttr(AlwaysInline))
+ : AttributeSet(EnumAttr(AlwaysInline)))
+
#if 0
__OMP_ATTRS_SET(InaccessibleOnlyAttrs,
OptimisticAttributes
@@ -535,6 +551,11 @@ __OMP_ATTRS_SET(ReadOnlyPtrAttrs,
EnumAttr(NoCapture))
: AttributeSet())
+__OMP_ATTRS_SET(DeviceAllocAttrs,
+ OptimisticAttributes
+ ? AttributeSet(EnumAttr(NoUnwind), EnumAttr(NoSync))
+ : AttributeSet(EnumAttr(NoUnwind), EnumAttr(NoSync)))
+
#if 0
__OMP_ATTRS_SET(WriteOnlyPtrAttrs,
OptimisticAttributes
@@ -575,6 +596,8 @@ __OMP_RTL_ATTRS(__kmpc_barrier, BarrierAttrs, AttributeSet(),
ParamAttrs(ReadOnlyPtrAttrs))
__OMP_RTL_ATTRS(__kmpc_barrier_simple_spmd, BarrierAttrs, AttributeSet(),
ParamAttrs(ReadOnlyPtrAttrs))
+__OMP_RTL_ATTRS(__kmpc_barrier_simple_generic, BarrierAttrs, AttributeSet(),
+ ParamAttrs(ReadOnlyPtrAttrs))
__OMP_RTL_ATTRS(__kmpc_warp_active_thread_mask, BarrierAttrs, AttributeSet(),
ParamAttrs())
__OMP_RTL_ATTRS(__kmpc_syncwarp, BarrierAttrs, AttributeSet(), ParamAttrs())
@@ -703,6 +726,28 @@ __OMP_RTL_ATTRS(__kmpc_for_static_init_8u, GetterArgWriteAttrs, AttributeSet(),
AttributeSet(), AttributeSet()))
__OMP_RTL_ATTRS(__kmpc_for_static_fini, InaccessibleArgOnlyAttrs,
AttributeSet(), ParamAttrs(ReadOnlyPtrAttrs))
+__OMP_RTL_ATTRS(__kmpc_distribute_static_init_4, GetterArgWriteAttrs,
+ AttributeSet(),
+ ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
+ ArgPtrAttrs, ArgPtrAttrs, ArgPtrAttrs, ArgPtrAttrs,
+ AttributeSet(), AttributeSet()))
+__OMP_RTL_ATTRS(__kmpc_distribute_static_init_4u, GetterArgWriteAttrs,
+ AttributeSet(),
+ ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
+ ArgPtrAttrs, ArgPtrAttrs, ArgPtrAttrs, ArgPtrAttrs,
+ AttributeSet(), AttributeSet()))
+__OMP_RTL_ATTRS(__kmpc_distribute_static_init_8, GetterArgWriteAttrs,
+ AttributeSet(),
+ ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
+ ArgPtrAttrs, ArgPtrAttrs, ArgPtrAttrs, ArgPtrAttrs,
+ AttributeSet(), AttributeSet()))
+__OMP_RTL_ATTRS(__kmpc_distribute_static_init_8u, GetterArgWriteAttrs,
+ AttributeSet(),
+ ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
+ ArgPtrAttrs, ArgPtrAttrs, ArgPtrAttrs, ArgPtrAttrs,
+ AttributeSet(), AttributeSet()))
+__OMP_RTL_ATTRS(__kmpc_distribute_static_fini, InaccessibleArgOnlyAttrs,
+ AttributeSet(), ParamAttrs(ReadOnlyPtrAttrs))
__OMP_RTL_ATTRS(__kmpc_dist_dispatch_init_4, GetterArgWriteAttrs,
AttributeSet(),
ParamAttrs(ReadOnlyPtrAttrs, AttributeSet(), AttributeSet(),
@@ -854,9 +899,9 @@ __OMP_RTL_ATTRS(__kmpc_doacross_wait, BarrierAttrs, AttributeSet(),
__OMP_RTL_ATTRS(__kmpc_doacross_fini, BarrierAttrs, AttributeSet(),
ParamAttrs(ReadOnlyPtrAttrs))
-__OMP_RTL_ATTRS(__kmpc_alloc_shared, DefaultAttrs, ReturnPtrAttrs,
+__OMP_RTL_ATTRS(__kmpc_alloc_shared, DeviceAllocAttrs, ReturnPtrAttrs,
ParamAttrs())
-__OMP_RTL_ATTRS(__kmpc_free_shared, AllocAttrs, AttributeSet(),
+__OMP_RTL_ATTRS(__kmpc_free_shared, DeviceAllocAttrs, AttributeSet(),
ParamAttrs(NoCaptureAttrs))
__OMP_RTL_ATTRS(__kmpc_alloc, DefaultAttrs, ReturnPtrAttrs, ParamAttrs())
@@ -897,6 +942,9 @@ __OMP_RTL_ATTRS(__tgt_push_mapper_component, ForkAttrs, AttributeSet(),
__OMP_RTL_ATTRS(__kmpc_task_allow_completion_event, DefaultAttrs,
ReturnPtrAttrs, ParamAttrs(ReadOnlyPtrAttrs))
+__OMP_RTL_ATTRS(__kmpc_parallel_51, AlwaysInlineAttrs, AttributeSet(),
+ ParamAttrs())
+
#undef __OMP_RTL_ATTRS
#undef OMP_RTL_ATTRS
#undef AttributeSet
@@ -920,6 +968,7 @@ __OMP_RTL_ATTRS(__kmpc_task_allow_completion_event, DefaultAttrs,
OMP_IDENT_FLAG(OMP_IDENT_FLAG_##Name, #Name, Value)
__OMP_IDENT_FLAG(KMPC, 0x02)
+__OMP_IDENT_FLAG(ATOMIC_REDUCE, 0x10)
__OMP_IDENT_FLAG(BARRIER_EXPL, 0x20)
__OMP_IDENT_FLAG(BARRIER_IMPL, 0x0040)
__OMP_IDENT_FLAG(BARRIER_IMPL_MASK, 0x01C0)