//===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a generalized class for OpenMP runtime code generation
// specialized by GPU targets NVPTX and AMDGCN.
//
//===----------------------------------------------------------------------===//

#include "CGOpenMPRuntimeGPU.h"
#include "CodeGenFunction.h"
#include "clang/AST/Attr.h"
#include "clang/AST/DeclOpenMP.h"
#include "clang/AST/OpenMPClause.h"
#include "clang/AST/StmtOpenMP.h"
#include "clang/AST/StmtVisitor.h"
#include "clang/Basic/Cuda.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/Support/MathExtras.h"

using namespace clang;
using namespace CodeGen;
using namespace llvm::omp;

namespace {
/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
class NVPTXActionTy final : public PrePostActionTy {
  llvm::FunctionCallee EnterCallee = nullptr;
  ArrayRef<llvm::Value *> EnterArgs;
  llvm::FunctionCallee ExitCallee = nullptr;
  ArrayRef<llvm::Value *> ExitArgs;
  bool Conditional = false;
  llvm::BasicBlock *ContBlock = nullptr;

public:
  NVPTXActionTy(llvm::FunctionCallee EnterCallee,
                ArrayRef<llvm::Value *> EnterArgs,
                llvm::FunctionCallee ExitCallee,
                ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
      : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
        ExitArgs(ExitArgs), Conditional(Conditional) {}
  void Enter(CodeGenFunction &CGF) override {
    llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
    if (Conditional) {
      llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
      auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
      ContBlock = CGF.createBasicBlock("omp_if.end");
      // Generate the branch (If-stmt)
      CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
      CGF.EmitBlock(ThenBlock);
    }
  }
  void Done(CodeGenFunction &CGF) {
    // Emit the rest of blocks/branches
    CGF.EmitBranch(ContBlock);
    CGF.EmitBlock(ContBlock, /*IsFinished=*/true);
  }
  void Exit(CodeGenFunction &CGF) override {
    CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
  }
};

/// A class to track the execution mode when codegening directives within
/// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
/// to the target region and used by containing directives such as 'parallel'
/// to emit optimized code.
class ExecutionRuntimeModesRAII {
private:
  CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode =
      CGOpenMPRuntimeGPU::EM_Unknown;
  CGOpenMPRuntimeGPU::ExecutionMode &ExecMode;

public:
  ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode,
                            CGOpenMPRuntimeGPU::ExecutionMode EntryMode)
      : ExecMode(ExecMode) {
    SavedExecMode = ExecMode;
    ExecMode = EntryMode;
  }
  ~ExecutionRuntimeModesRAII() { ExecMode = SavedExecMode; }
};
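
// Typical use in this file (sketch; see emitSPMDKernel/emitNonSPMDKernel):
//
//   ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_SPMD);
//   // ... emit the kernel; the previous execution mode is restored when
//   // ModeRAII goes out of scope, even on early returns.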

static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
  RefExpr = RefExpr->IgnoreParens();
  if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
    const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
    while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
      Base = TempASE->getBase()->IgnoreParenImpCasts();
    RefExpr = Base;
  } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) {
    const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
    while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
      Base = TempOASE->getBase()->IgnoreParenImpCasts();
    while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
      Base = TempASE->getBase()->IgnoreParenImpCasts();
    RefExpr = Base;
  }
  RefExpr = RefExpr->IgnoreParenImpCasts();
  if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
    return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
  const auto *ME = cast<MemberExpr>(RefExpr);
  return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
}
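
/// Build an implicit record type ("_globalized_locals_ty") that groups the
/// globalized copies of the escaped variables: decls in \p EscapedDecls get
/// an array of \p BufSize slots (when BufSize > 1), while decls in
/// \p EscapedDeclsForTeams get a single, suitably aligned field.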
static RecordDecl *buildRecordForGlobalizedVars(
    ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
    ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &MappedDeclsFields,
    int BufSize) {
  using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
  if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
    return nullptr;
  SmallVector<VarsDataTy, 4> GlobalizedVars;
  for (const ValueDecl *D : EscapedDecls)
    GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
  for (const ValueDecl *D : EscapedDeclsForTeams)
    GlobalizedVars.emplace_back(C.getDeclAlign(D), D);

  // Build struct _globalized_locals_ty {
  //   /* globalized vars */[WarpSize] align (decl_align)
  //   /* globalized vars */ for EscapedDeclsForTeams
  // };
  RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
  GlobalizedRD->startDefinition();
  llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
      EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
  for (const auto &Pair : GlobalizedVars) {
    const ValueDecl *VD = Pair.second;
    QualType Type = VD->getType();
    if (Type->isLValueReferenceType())
      Type = C.getPointerType(Type.getNonReferenceType());
    else
      Type = Type.getNonReferenceType();
    SourceLocation Loc = VD->getLocation();
    FieldDecl *Field;
    if (SingleEscaped.count(VD)) {
      Field = FieldDecl::Create(
          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      if (VD->hasAttrs()) {
        for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
             E(VD->getAttrs().end());
             I != E; ++I)
          Field->addAttr(*I);
      }
    } else {
      if (BufSize > 1) {
        llvm::APInt ArraySize(32, BufSize);
        Type = C.getConstantArrayType(Type, ArraySize, nullptr,
                                      ArraySizeModifier::Normal, 0);
      }
      Field = FieldDecl::Create(
          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      llvm::APInt Align(32, Pair.first.getQuantity());
      Field->addAttr(AlignedAttr::CreateImplicit(
          C, /*IsAlignmentExpr=*/true,
          IntegerLiteral::Create(C, Align,
                                 C.getIntTypeForBitwidth(32, /*Signed=*/0),
                                 SourceLocation()),
          {}, AlignedAttr::GNU_aligned));
    }
    GlobalizedRD->addDecl(Field);
    MappedDeclsFields.try_emplace(VD, Field);
  }
  GlobalizedRD->completeDefinition();
  return GlobalizedRD;
}

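// A local variable "escapes" its declaration context when other threads may
// need to access its storage, e.g. (illustrative snippet, not from a test):
//
//   #pragma omp target teams
//   {
//     int X = 0;
//     #pragma omp parallel
//     X += 1; // 'X' must be globalized so all threads see the same storage.
//   }
//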
182/// Get the list of variables that can escape their declaration context.
183class CheckVarsEscapingDeclContext final
184 : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
185 CodeGenFunction &CGF;
186 llvm::SetVector<const ValueDecl *> EscapedDecls;
187 llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
188 llvm::SetVector<const ValueDecl *> DelayedVariableLengthDecls;
189 llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
190 RecordDecl *GlobalizedRD = nullptr;
191 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
192 bool AllEscaped = false;
193 bool IsForCombinedParallelRegion = false;
194
195 void markAsEscaped(const ValueDecl *VD) {
196 // Do not globalize declare target variables.
197 if (!isa<VarDecl>(VD) ||
198 OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
199 return;
200 VD = cast<ValueDecl>(VD->getCanonicalDecl());
201 // Use user-specified allocation.
202 if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
203 return;
204 // Variables captured by value must be globalized.
205 bool IsCaptured = false;
206 if (auto *CSI = CGF.CapturedStmtInfo) {
207 if (const FieldDecl *FD = CSI->lookup(VD: cast<VarDecl>(Val: VD))) {
208 // Check if need to capture the variable that was already captured by
209 // value in the outer region.
210 IsCaptured = true;
211 if (!IsForCombinedParallelRegion) {
212 if (!FD->hasAttrs())
213 return;
214 const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
215 if (!Attr)
216 return;
217 if (((Attr->getCaptureKind() != OMPC_map) &&
218 !isOpenMPPrivate(Attr->getCaptureKind())) ||
219 ((Attr->getCaptureKind() == OMPC_map) &&
220 !FD->getType()->isAnyPointerType()))
221 return;
222 }
223 if (!FD->getType()->isReferenceType()) {
224 assert(!VD->getType()->isVariablyModifiedType() &&
225 "Parameter captured by value with variably modified type");
226 EscapedParameters.insert(VD);
227 } else if (!IsForCombinedParallelRegion) {
228 return;
229 }
230 }
231 }
232 if ((!CGF.CapturedStmtInfo ||
233 (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
234 VD->getType()->isReferenceType())
235 // Do not globalize variables with reference type.
236 return;
237 if (VD->getType()->isVariablyModifiedType()) {
238 // If not captured at the target region level then mark the escaped
239 // variable as delayed.
240 if (IsCaptured)
241 EscapedVariableLengthDecls.insert(X: VD);
242 else
243 DelayedVariableLengthDecls.insert(X: VD);
244 } else
245 EscapedDecls.insert(X: VD);
246 }
247
248 void VisitValueDecl(const ValueDecl *VD) {
249 if (VD->getType()->isLValueReferenceType())
250 markAsEscaped(VD);
251 if (const auto *VarD = dyn_cast<VarDecl>(Val: VD)) {
252 if (!isa<ParmVarDecl>(Val: VarD) && VarD->hasInit()) {
253 const bool SavedAllEscaped = AllEscaped;
254 AllEscaped = VD->getType()->isLValueReferenceType();
255 Visit(VarD->getInit());
256 AllEscaped = SavedAllEscaped;
257 }
258 }
259 }
260 void VisitOpenMPCapturedStmt(const CapturedStmt *S,
261 ArrayRef<OMPClause *> Clauses,
262 bool IsCombinedParallelRegion) {
263 if (!S)
264 return;
265 for (const CapturedStmt::Capture &C : S->captures()) {
266 if (C.capturesVariable() && !C.capturesVariableByCopy()) {
267 const ValueDecl *VD = C.getCapturedVar();
268 bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
269 if (IsCombinedParallelRegion) {
270 // Check if the variable is privatized in the combined construct and
271 // those private copies must be shared in the inner parallel
272 // directive.
273 IsForCombinedParallelRegion = false;
274 for (const OMPClause *C : Clauses) {
275 if (!isOpenMPPrivate(C->getClauseKind()) ||
276 C->getClauseKind() == OMPC_reduction ||
277 C->getClauseKind() == OMPC_linear ||
278 C->getClauseKind() == OMPC_private)
279 continue;
280 ArrayRef<const Expr *> Vars;
281 if (const auto *PC = dyn_cast<OMPFirstprivateClause>(Val: C))
282 Vars = PC->getVarRefs();
283 else if (const auto *PC = dyn_cast<OMPLastprivateClause>(Val: C))
284 Vars = PC->getVarRefs();
285 else
286 llvm_unreachable("Unexpected clause.");
287 for (const auto *E : Vars) {
288 const Decl *D =
289 cast<DeclRefExpr>(Val: E)->getDecl()->getCanonicalDecl();
290 if (D == VD->getCanonicalDecl()) {
291 IsForCombinedParallelRegion = true;
292 break;
293 }
294 }
295 if (IsForCombinedParallelRegion)
296 break;
297 }
298 }
299 markAsEscaped(VD);
300 if (isa<OMPCapturedExprDecl>(Val: VD))
301 VisitValueDecl(VD);
302 IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
303 }
304 }
305 }
306
307 void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
308 assert(!GlobalizedRD &&
309 "Record for globalized variables is built already.");
310 ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
311 unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
312 if (IsInTTDRegion)
313 EscapedDeclsForTeams = EscapedDecls.getArrayRef();
314 else
315 EscapedDeclsForParallel = EscapedDecls.getArrayRef();
316 GlobalizedRD = ::buildRecordForGlobalizedVars(
317 C&: CGF.getContext(), EscapedDecls: EscapedDeclsForParallel, EscapedDeclsForTeams,
318 MappedDeclsFields, BufSize: WarpSize);
319 }
320
321public:
322 CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
323 ArrayRef<const ValueDecl *> TeamsReductions)
324 : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
325 }
326 virtual ~CheckVarsEscapingDeclContext() = default;
327 void VisitDeclStmt(const DeclStmt *S) {
328 if (!S)
329 return;
330 for (const Decl *D : S->decls())
331 if (const auto *VD = dyn_cast_or_null<ValueDecl>(Val: D))
332 VisitValueDecl(VD);
333 }
334 void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
335 if (!D)
336 return;
337 if (!D->hasAssociatedStmt())
338 return;
339 if (const auto *S =
340 dyn_cast_or_null<CapturedStmt>(Val: D->getAssociatedStmt())) {
341 // Do not analyze directives that do not actually require capturing,
342 // like `omp for` or `omp simd` directives.
343 llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
344 getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
345 if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
346 VisitStmt(S: S->getCapturedStmt());
347 return;
348 }
349 VisitOpenMPCapturedStmt(
350 S, D->clauses(),
351 CaptureRegions.back() == OMPD_parallel &&
352 isOpenMPDistributeDirective(D->getDirectiveKind()));
353 }
354 }
355 void VisitCapturedStmt(const CapturedStmt *S) {
356 if (!S)
357 return;
358 for (const CapturedStmt::Capture &C : S->captures()) {
359 if (C.capturesVariable() && !C.capturesVariableByCopy()) {
360 const ValueDecl *VD = C.getCapturedVar();
361 markAsEscaped(VD);
362 if (isa<OMPCapturedExprDecl>(Val: VD))
363 VisitValueDecl(VD);
364 }
365 }
366 }
367 void VisitLambdaExpr(const LambdaExpr *E) {
368 if (!E)
369 return;
370 for (const LambdaCapture &C : E->captures()) {
371 if (C.capturesVariable()) {
372 if (C.getCaptureKind() == LCK_ByRef) {
373 const ValueDecl *VD = C.getCapturedVar();
374 markAsEscaped(VD);
375 if (E->isInitCapture(Capture: &C) || isa<OMPCapturedExprDecl>(Val: VD))
376 VisitValueDecl(VD);
377 }
378 }
379 }
380 }
381 void VisitBlockExpr(const BlockExpr *E) {
382 if (!E)
383 return;
384 for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
385 if (C.isByRef()) {
386 const VarDecl *VD = C.getVariable();
387 markAsEscaped(VD);
388 if (isa<OMPCapturedExprDecl>(Val: VD) || VD->isInitCapture())
389 VisitValueDecl(VD);
390 }
391 }
392 }
393 void VisitCallExpr(const CallExpr *E) {
394 if (!E)
395 return;
396 for (const Expr *Arg : E->arguments()) {
397 if (!Arg)
398 continue;
399 if (Arg->isLValue()) {
400 const bool SavedAllEscaped = AllEscaped;
401 AllEscaped = true;
402 Visit(Arg);
403 AllEscaped = SavedAllEscaped;
404 } else {
405 Visit(Arg);
406 }
407 }
408 Visit(E->getCallee());
409 }
410 void VisitDeclRefExpr(const DeclRefExpr *E) {
411 if (!E)
412 return;
413 const ValueDecl *VD = E->getDecl();
414 if (AllEscaped)
415 markAsEscaped(VD);
416 if (isa<OMPCapturedExprDecl>(Val: VD))
417 VisitValueDecl(VD);
418 else if (VD->isInitCapture())
419 VisitValueDecl(VD);
420 }
421 void VisitUnaryOperator(const UnaryOperator *E) {
422 if (!E)
423 return;
424 if (E->getOpcode() == UO_AddrOf) {
425 const bool SavedAllEscaped = AllEscaped;
426 AllEscaped = true;
427 Visit(E->getSubExpr());
428 AllEscaped = SavedAllEscaped;
429 } else {
430 Visit(E->getSubExpr());
431 }
432 }
433 void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
434 if (!E)
435 return;
436 if (E->getCastKind() == CK_ArrayToPointerDecay) {
437 const bool SavedAllEscaped = AllEscaped;
438 AllEscaped = true;
439 Visit(E->getSubExpr());
440 AllEscaped = SavedAllEscaped;
441 } else {
442 Visit(E->getSubExpr());
443 }
444 }
445 void VisitExpr(const Expr *E) {
446 if (!E)
447 return;
448 bool SavedAllEscaped = AllEscaped;
449 if (!E->isLValue())
450 AllEscaped = false;
451 for (const Stmt *Child : E->children())
452 if (Child)
453 Visit(Child);
454 AllEscaped = SavedAllEscaped;
455 }
456 void VisitStmt(const Stmt *S) {
457 if (!S)
458 return;
459 for (const Stmt *Child : S->children())
460 if (Child)
461 Visit(Child);
462 }
463
464 /// Returns the record that handles all the escaped local variables and used
465 /// instead of their original storage.
466 const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
467 if (!GlobalizedRD)
468 buildRecordForGlobalizedVars(IsInTTDRegion);
469 return GlobalizedRD;
470 }
471
472 /// Returns the field in the globalized record for the escaped variable.
473 const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
474 assert(GlobalizedRD &&
475 "Record for globalized variables must be generated already.");
476 return MappedDeclsFields.lookup(Val: VD);
477 }
478
479 /// Returns the list of the escaped local variables/parameters.
480 ArrayRef<const ValueDecl *> getEscapedDecls() const {
481 return EscapedDecls.getArrayRef();
482 }
483
484 /// Checks if the escaped local variable is actually a parameter passed by
485 /// value.
486 const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
487 return EscapedParameters;
488 }
489
490 /// Returns the list of the escaped variables with the variably modified
491 /// types.
492 ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
493 return EscapedVariableLengthDecls.getArrayRef();
494 }
495
496 /// Returns the list of the delayed variables with the variably modified
497 /// types.
498 ArrayRef<const ValueDecl *> getDelayedVariableLengthDecls() const {
499 return DelayedVariableLengthDecls.getArrayRef();
500 }
501};
} // anonymous namespace

/// Get the id of the warp in the block.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  unsigned LaneIDBits =
      llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
}

/// Get the id of the current lane in the Warp.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  unsigned LaneIDBits =
      llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
  assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
  unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask),
                       "nvptx_lane_id");
}
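
// Worked example with the usual warp size of 32 (LaneIDBits == 5):
// GPU thread 37 is in warp 37 >> 5 == 1, at lane 37 & 0x1f == 5.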

CGOpenMPRuntimeGPU::ExecutionMode
CGOpenMPRuntimeGPU::getExecutionMode() const {
  return CurrentExecutionMode;
}

CGOpenMPRuntimeGPU::DataSharingMode
CGOpenMPRuntimeGPU::getDataSharingMode() const {
  return CurrentDataSharingMode;
}
538
539/// Check for inner (nested) SPMD construct, if any
540static bool hasNestedSPMDDirective(ASTContext &Ctx,
541 const OMPExecutableDirective &D) {
542 const auto *CS = D.getInnermostCapturedStmt();
543 const auto *Body =
544 CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
545 const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
546
547 if (const auto *NestedDir =
548 dyn_cast_or_null<OMPExecutableDirective>(Val: ChildStmt)) {
549 OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
550 switch (D.getDirectiveKind()) {
551 case OMPD_target:
552 if (isOpenMPParallelDirective(DKind))
553 return true;
554 if (DKind == OMPD_teams) {
555 Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
556 /*IgnoreCaptured=*/true);
557 if (!Body)
558 return false;
559 ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
560 if (const auto *NND =
561 dyn_cast_or_null<OMPExecutableDirective>(Val: ChildStmt)) {
562 DKind = NND->getDirectiveKind();
563 if (isOpenMPParallelDirective(DKind))
564 return true;
565 }
566 }
567 return false;
568 case OMPD_target_teams:
569 return isOpenMPParallelDirective(DKind);
570 case OMPD_target_simd:
571 case OMPD_target_parallel:
572 case OMPD_target_parallel_for:
573 case OMPD_target_parallel_for_simd:
574 case OMPD_target_teams_distribute:
575 case OMPD_target_teams_distribute_simd:
576 case OMPD_target_teams_distribute_parallel_for:
577 case OMPD_target_teams_distribute_parallel_for_simd:
578 case OMPD_parallel:
579 case OMPD_for:
580 case OMPD_parallel_for:
581 case OMPD_parallel_master:
582 case OMPD_parallel_sections:
583 case OMPD_for_simd:
584 case OMPD_parallel_for_simd:
585 case OMPD_cancel:
586 case OMPD_cancellation_point:
587 case OMPD_ordered:
588 case OMPD_threadprivate:
589 case OMPD_allocate:
590 case OMPD_task:
591 case OMPD_simd:
592 case OMPD_sections:
593 case OMPD_section:
594 case OMPD_single:
595 case OMPD_master:
596 case OMPD_critical:
597 case OMPD_taskyield:
598 case OMPD_barrier:
599 case OMPD_taskwait:
600 case OMPD_taskgroup:
601 case OMPD_atomic:
602 case OMPD_flush:
603 case OMPD_depobj:
604 case OMPD_scan:
605 case OMPD_teams:
606 case OMPD_target_data:
607 case OMPD_target_exit_data:
608 case OMPD_target_enter_data:
609 case OMPD_distribute:
610 case OMPD_distribute_simd:
611 case OMPD_distribute_parallel_for:
612 case OMPD_distribute_parallel_for_simd:
613 case OMPD_teams_distribute:
614 case OMPD_teams_distribute_simd:
615 case OMPD_teams_distribute_parallel_for:
616 case OMPD_teams_distribute_parallel_for_simd:
617 case OMPD_target_update:
618 case OMPD_declare_simd:
619 case OMPD_declare_variant:
620 case OMPD_begin_declare_variant:
621 case OMPD_end_declare_variant:
622 case OMPD_declare_target:
623 case OMPD_end_declare_target:
624 case OMPD_declare_reduction:
625 case OMPD_declare_mapper:
626 case OMPD_taskloop:
627 case OMPD_taskloop_simd:
628 case OMPD_master_taskloop:
629 case OMPD_master_taskloop_simd:
630 case OMPD_parallel_master_taskloop:
631 case OMPD_parallel_master_taskloop_simd:
632 case OMPD_requires:
633 case OMPD_unknown:
634 default:
635 llvm_unreachable("Unexpected directive.");
636 }
637 }
638
639 return false;
640}
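
// Example (illustrative): the region below is reported as having a nested
// SPMD directive, since the innermost statement of the 'target' region is a
// parallel worksharing construct:
//
//   #pragma omp target
//   #pragma omp parallel for
//   for (int I = 0; I < N; ++I) ...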
641
642static bool supportsSPMDExecutionMode(ASTContext &Ctx,
643 const OMPExecutableDirective &D) {
644 OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
645 switch (DirectiveKind) {
646 case OMPD_target:
647 case OMPD_target_teams:
648 return hasNestedSPMDDirective(Ctx, D);
649 case OMPD_target_teams_loop:
650 case OMPD_target_parallel_loop:
651 case OMPD_target_parallel:
652 case OMPD_target_parallel_for:
653 case OMPD_target_parallel_for_simd:
654 case OMPD_target_teams_distribute_parallel_for:
655 case OMPD_target_teams_distribute_parallel_for_simd:
656 case OMPD_target_simd:
657 case OMPD_target_teams_distribute_simd:
658 return true;
659 case OMPD_target_teams_distribute:
660 return false;
661 case OMPD_parallel:
662 case OMPD_for:
663 case OMPD_parallel_for:
664 case OMPD_parallel_master:
665 case OMPD_parallel_sections:
666 case OMPD_for_simd:
667 case OMPD_parallel_for_simd:
668 case OMPD_cancel:
669 case OMPD_cancellation_point:
670 case OMPD_ordered:
671 case OMPD_threadprivate:
672 case OMPD_allocate:
673 case OMPD_task:
674 case OMPD_simd:
675 case OMPD_sections:
676 case OMPD_section:
677 case OMPD_single:
678 case OMPD_master:
679 case OMPD_critical:
680 case OMPD_taskyield:
681 case OMPD_barrier:
682 case OMPD_taskwait:
683 case OMPD_taskgroup:
684 case OMPD_atomic:
685 case OMPD_flush:
686 case OMPD_depobj:
687 case OMPD_scan:
688 case OMPD_teams:
689 case OMPD_target_data:
690 case OMPD_target_exit_data:
691 case OMPD_target_enter_data:
692 case OMPD_distribute:
693 case OMPD_distribute_simd:
694 case OMPD_distribute_parallel_for:
695 case OMPD_distribute_parallel_for_simd:
696 case OMPD_teams_distribute:
697 case OMPD_teams_distribute_simd:
698 case OMPD_teams_distribute_parallel_for:
699 case OMPD_teams_distribute_parallel_for_simd:
700 case OMPD_target_update:
701 case OMPD_declare_simd:
702 case OMPD_declare_variant:
703 case OMPD_begin_declare_variant:
704 case OMPD_end_declare_variant:
705 case OMPD_declare_target:
706 case OMPD_end_declare_target:
707 case OMPD_declare_reduction:
708 case OMPD_declare_mapper:
709 case OMPD_taskloop:
710 case OMPD_taskloop_simd:
711 case OMPD_master_taskloop:
712 case OMPD_master_taskloop_simd:
713 case OMPD_parallel_master_taskloop:
714 case OMPD_parallel_master_taskloop_simd:
715 case OMPD_requires:
716 case OMPD_unknown:
717 default:
718 break;
719 }
720 llvm_unreachable(
721 "Unknown programming model for OpenMP directive on NVPTX target.");
722}
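
// In practice (per the switch above): combined constructs such as
// 'target parallel for' or 'target teams distribute parallel for' run in
// SPMD mode, whereas a plain 'target teams distribute' with no nested
// parallel region falls back to the generic (non-SPMD) kernel.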

void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
                                           StringRef ParentName,
                                           llvm::Function *&OutlinedFn,
                                           llvm::Constant *&OutlinedFnID,
                                           bool IsOffloadEntry,
                                           const RegionCodeGenTy &CodeGen) {
  ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_NonSPMD);
  EntryFunctionState EST;
  WrapperFunctionsMap.clear();

  [[maybe_unused]] bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
  assert(!IsBareKernel && "bare kernel should not be at generic mode");

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeGPU::EntryFunctionState &EST;
    const OMPExecutableDirective &D;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST,
                         const OMPExecutableDirective &D)
        : EST(EST), D(D) {}
    void Enter(CodeGenFunction &CGF) override {
      auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
      RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ false);
      // Skip target region initialization.
      RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
    }
    void Exit(CodeGenFunction &CGF) override {
      auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
      RT.clearLocThreadIdInsertPt(CGF);
      RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false);
    }
  } Action(EST, D);
  CodeGen.setAction(Action);
  IsInTTDRegion = true;
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
  IsInTTDRegion = false;
}

void CGOpenMPRuntimeGPU::emitKernelInit(const OMPExecutableDirective &D,
                                        CodeGenFunction &CGF,
                                        EntryFunctionState &EST, bool IsSPMD) {
  int32_t MinThreadsVal = 1, MaxThreadsVal = -1, MinTeamsVal = 1,
          MaxTeamsVal = -1;
  computeMinAndMaxThreadsAndTeams(D, CGF, MinThreadsVal, MaxThreadsVal,
                                  MinTeamsVal, MaxTeamsVal);

  CGBuilderTy &Bld = CGF.Builder;
  Bld.restoreIP(OMPBuilder.createTargetInit(
      Bld, IsSPMD, MinThreadsVal, MaxThreadsVal, MinTeamsVal, MaxTeamsVal));
  if (!IsSPMD)
    emitGenericVarsProlog(CGF, EST.Loc);
}
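
// Note: createTargetInit above is what materializes the __kmpc_target_init
// call in the kernel prologue; for generic (non-SPMD) kernels it is also
// expected to set up the worker-thread state machine while the main thread
// continues with the user code (see OpenMPIRBuilder for the exact sequence).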

void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
                                          EntryFunctionState &EST,
                                          bool IsSPMD) {
  if (!IsSPMD)
    emitGenericVarsEpilog(CGF);

  // This is temporary until we remove the fixed sized buffer.
  ASTContext &C = CGM.getContext();
  RecordDecl *StaticRD = C.buildImplicitRecord(
      "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::Union);
  StaticRD->startDefinition();
  for (const RecordDecl *TeamReductionRec : TeamsReductions) {
    QualType RecTy = C.getRecordType(TeamReductionRec);
    auto *Field = FieldDecl::Create(
        C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
        C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
        /*BW=*/nullptr, /*Mutable=*/false,
        /*InitStyle=*/ICIS_NoInit);
    Field->setAccess(AS_public);
    StaticRD->addDecl(Field);
  }
  StaticRD->completeDefinition();
  QualType StaticTy = C.getRecordType(StaticRD);
  llvm::Type *LLVMReductionsBufferTy =
      CGM.getTypes().ConvertTypeForMem(StaticTy);
  const auto &DL = CGM.getModule().getDataLayout();
  uint64_t ReductionDataSize =
      TeamsReductions.empty()
          ? 0
          : DL.getTypeAllocSize(LLVMReductionsBufferTy).getFixedValue();
  CGBuilderTy &Bld = CGF.Builder;
  OMPBuilder.createTargetDeinit(Bld, ReductionDataSize,
                                C.getLangOpts().OpenMPCUDAReductionBufNum);
  TeamsReductions.clear();
}
815
816void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
817 StringRef ParentName,
818 llvm::Function *&OutlinedFn,
819 llvm::Constant *&OutlinedFnID,
820 bool IsOffloadEntry,
821 const RegionCodeGenTy &CodeGen) {
822 ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_SPMD);
823 EntryFunctionState EST;
824
825 bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
826
827 // Emit target region as a standalone region.
828 class NVPTXPrePostActionTy : public PrePostActionTy {
829 CGOpenMPRuntimeGPU &RT;
830 CGOpenMPRuntimeGPU::EntryFunctionState &EST;
831 bool IsBareKernel;
832 DataSharingMode Mode;
833 const OMPExecutableDirective &D;
834
835 public:
836 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
837 CGOpenMPRuntimeGPU::EntryFunctionState &EST,
838 bool IsBareKernel, const OMPExecutableDirective &D)
839 : RT(RT), EST(EST), IsBareKernel(IsBareKernel),
840 Mode(RT.CurrentDataSharingMode), D(D) {}
841 void Enter(CodeGenFunction &CGF) override {
842 if (IsBareKernel) {
843 RT.CurrentDataSharingMode = DataSharingMode::DS_CUDA;
844 return;
845 }
846 RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ true);
847 // Skip target region initialization.
848 RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
849 }
850 void Exit(CodeGenFunction &CGF) override {
851 if (IsBareKernel) {
852 RT.CurrentDataSharingMode = Mode;
853 return;
854 }
855 RT.clearLocThreadIdInsertPt(CGF);
856 RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true);
857 }
858 } Action(*this, EST, IsBareKernel, D);
859 CodeGen.setAction(Action);
860 IsInTTDRegion = true;
861 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
862 IsOffloadEntry, CodeGen);
863 IsInTTDRegion = false;
864}
865
866void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
867 const OMPExecutableDirective &D, StringRef ParentName,
868 llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
869 bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
870 if (!IsOffloadEntry) // Nothing to do.
871 return;
872
873 assert(!ParentName.empty() && "Invalid target region parent name!");
874
875 bool Mode = supportsSPMDExecutionMode(Ctx&: CGM.getContext(), D);
876 bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
877 if (Mode || IsBareKernel)
878 emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
879 CodeGen);
880 else
881 emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
882 CodeGen);
883}
884
885CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
886 : CGOpenMPRuntime(CGM) {
887 llvm::OpenMPIRBuilderConfig Config(
888 CGM.getLangOpts().OpenMPIsTargetDevice, isGPU(),
889 CGM.getLangOpts().OpenMPOffloadMandatory,
890 /*HasRequiresReverseOffload*/ false, /*HasRequiresUnifiedAddress*/ false,
891 hasRequiresUnifiedSharedMemory(), /*HasRequiresDynamicAllocators*/ false);
892 OMPBuilder.setConfig(Config);
893
894 if (!CGM.getLangOpts().OpenMPIsTargetDevice)
895 llvm_unreachable("OpenMP can only handle device code.");
896
897 if (CGM.getLangOpts().OpenMPCUDAMode)
898 CurrentDataSharingMode = CGOpenMPRuntimeGPU::DS_CUDA;
899
900 llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder();
901 if (CGM.getLangOpts().NoGPULib || CGM.getLangOpts().OMPHostIRFile.empty())
902 return;
903
904 OMPBuilder.createGlobalFlag(Value: CGM.getLangOpts().OpenMPTargetDebug,
905 Name: "__omp_rtl_debug_kind");
906 OMPBuilder.createGlobalFlag(Value: CGM.getLangOpts().OpenMPTeamSubscription,
907 Name: "__omp_rtl_assume_teams_oversubscription");
908 OMPBuilder.createGlobalFlag(Value: CGM.getLangOpts().OpenMPThreadSubscription,
909 Name: "__omp_rtl_assume_threads_oversubscription");
910 OMPBuilder.createGlobalFlag(Value: CGM.getLangOpts().OpenMPNoThreadState,
911 Name: "__omp_rtl_assume_no_thread_state");
912 OMPBuilder.createGlobalFlag(Value: CGM.getLangOpts().OpenMPNoNestedParallelism,
913 Name: "__omp_rtl_assume_no_nested_parallelism");
914}
915
916void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF,
917 ProcBindKind ProcBind,
918 SourceLocation Loc) {
919 // Nothing to do.
920}
921
922void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF,
923 llvm::Value *NumThreads,
924 SourceLocation Loc) {
925 // Nothing to do.
926}
927
928void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF,
929 const Expr *NumTeams,
930 const Expr *ThreadLimit,
931 SourceLocation Loc) {}
932
933llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction(
934 CodeGenFunction &CGF, const OMPExecutableDirective &D,
935 const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
936 const RegionCodeGenTy &CodeGen) {
937 // Emit target region as a standalone region.
938 bool PrevIsInTTDRegion = IsInTTDRegion;
939 IsInTTDRegion = false;
940 auto *OutlinedFun =
941 cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
942 CGF, D, ThreadIDVar, InnermostKind, CodeGen));
943 IsInTTDRegion = PrevIsInTTDRegion;
944 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) {
945 llvm::Function *WrapperFun =
946 createParallelDataSharingWrapper(OutlinedParallelFn: OutlinedFun, D);
947 WrapperFunctionsMap[OutlinedFun] = WrapperFun;
948 }
949
950 return OutlinedFun;
951}
952
953/// Get list of lastprivate variables from the teams distribute ... or
954/// teams {distribute ...} directives.
955static void
956getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
957 llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
958 assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
959 "expected teams directive.");
960 const OMPExecutableDirective *Dir = &D;
961 if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
962 if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(
963 Ctx,
964 Body: D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
965 /*IgnoreCaptured=*/true))) {
966 Dir = dyn_cast_or_null<OMPExecutableDirective>(Val: S);
967 if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
968 Dir = nullptr;
969 }
970 }
971 if (!Dir)
972 return;
973 for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
974 for (const Expr *E : C->getVarRefs())
975 Vars.push_back(getPrivateItem(E));
976 }
977}
978
979/// Get list of reduction variables from the teams ... directives.
980static void
981getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
982 llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
983 assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
984 "expected teams directive.");
985 for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
986 for (const Expr *E : C->privates())
987 Vars.push_back(Elt: getPrivateItem(RefExpr: E));
988 }
989}
990
991llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
992 CodeGenFunction &CGF, const OMPExecutableDirective &D,
993 const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
994 const RegionCodeGenTy &CodeGen) {
995 SourceLocation Loc = D.getBeginLoc();
996
997 const RecordDecl *GlobalizedRD = nullptr;
998 llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
999 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
1000 unsigned WarpSize = CGM.getTarget().getGridValue().GV_Warp_Size;
1001 // Globalize team reductions variable unconditionally in all modes.
1002 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
1003 getTeamsReductionVars(Ctx&: CGM.getContext(), D, Vars&: LastPrivatesReductions);
1004 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
1005 getDistributeLastprivateVars(Ctx&: CGM.getContext(), D, Vars&: LastPrivatesReductions);
1006 if (!LastPrivatesReductions.empty()) {
1007 GlobalizedRD = ::buildRecordForGlobalizedVars(
1008 C&: CGM.getContext(), EscapedDecls: std::nullopt, EscapedDeclsForTeams: LastPrivatesReductions,
1009 MappedDeclsFields, BufSize: WarpSize);
1010 }
1011 } else if (!LastPrivatesReductions.empty()) {
1012 assert(!TeamAndReductions.first &&
1013 "Previous team declaration is not expected.");
1014 TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
1015 std::swap(LHS&: TeamAndReductions.second, RHS&: LastPrivatesReductions);
1016 }
1017
1018 // Emit target region as a standalone region.
1019 class NVPTXPrePostActionTy : public PrePostActionTy {
1020 SourceLocation &Loc;
1021 const RecordDecl *GlobalizedRD;
1022 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1023 &MappedDeclsFields;
1024
1025 public:
1026 NVPTXPrePostActionTy(
1027 SourceLocation &Loc, const RecordDecl *GlobalizedRD,
1028 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1029 &MappedDeclsFields)
1030 : Loc(Loc), GlobalizedRD(GlobalizedRD),
1031 MappedDeclsFields(MappedDeclsFields) {}
1032 void Enter(CodeGenFunction &CGF) override {
1033 auto &Rt =
1034 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1035 if (GlobalizedRD) {
1036 auto I = Rt.FunctionGlobalizedDecls.try_emplace(Key: CGF.CurFn).first;
1037 I->getSecond().MappedParams =
1038 std::make_unique<CodeGenFunction::OMPMapVars>();
1039 DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
1040 for (const auto &Pair : MappedDeclsFields) {
1041 assert(Pair.getFirst()->isCanonicalDecl() &&
1042 "Expected canonical declaration");
1043 Data.insert(std::make_pair(x: Pair.getFirst(), y: MappedVarData()));
1044 }
1045 }
1046 Rt.emitGenericVarsProlog(CGF, Loc);
1047 }
1048 void Exit(CodeGenFunction &CGF) override {
1049 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
1050 .emitGenericVarsEpilog(CGF);
1051 }
1052 } Action(Loc, GlobalizedRD, MappedDeclsFields);
1053 CodeGen.setAction(Action);
1054 llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
1055 CGF, D, ThreadIDVar, InnermostKind, CodeGen);
1056
1057 return OutlinedFun;
1058}

void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
                                               SourceLocation Loc) {
  if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
    return;

  CGBuilderTy &Bld = CGF.Builder;

  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I == FunctionGlobalizedDecls.end())
    return;

  for (auto &Rec : I->getSecond().LocalVarData) {
    const auto *VD = cast<VarDecl>(Rec.first);
    bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
    QualType VarTy = VD->getType();

    // Get the local allocation of a firstprivate variable before sharing.
    llvm::Value *ParValue;
    if (EscapedParam) {
      LValue ParLVal =
          CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
      ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
    }

    // Allocate space for the variable to be globalized.
    llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
    llvm::CallBase *VoidPtr =
        CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                                CGM.getModule(), OMPRTL___kmpc_alloc_shared),
                            AllocArgs, VD->getName());
    // FIXME: We should use the variable's actual alignment as an argument.
    VoidPtr->addRetAttr(llvm::Attribute::get(
        CGM.getLLVMContext(), llvm::Attribute::Alignment,
        CGM.getContext().getTargetInfo().getNewAlign() / 8));

    // Cast the void pointer and get the address of the globalized variable.
    llvm::PointerType *VarPtrTy = CGF.ConvertTypeForMem(VarTy)->getPointerTo();
    llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
        VoidPtr, VarPtrTy, VD->getName() + "_on_stack");
    LValue VarAddr = CGF.MakeNaturalAlignAddrLValue(CastedVoidPtr, VarTy);
    Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
    Rec.second.GlobalizedVal = VoidPtr;

    // Assign the local allocation to the newly globalized location.
    if (EscapedParam) {
      CGF.EmitStoreOfScalar(ParValue, VarAddr);
      I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress(CGF));
    }
    if (auto *DI = CGF.getDebugInfo())
      VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(VD->getLocation()));
  }

  for (const auto *ValueD : I->getSecond().EscapedVariableLengthDecls) {
    const auto *VD = cast<VarDecl>(ValueD);
    std::pair<llvm::Value *, llvm::Value *> AddrSizePair =
        getKmpcAllocShared(CGF, VD);
    I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(AddrSizePair);
    LValue Base = CGF.MakeAddrLValue(AddrSizePair.first, VD->getType(),
                                     CGM.getContext().getDeclAlign(VD),
                                     AlignmentSource::Decl);
    I->getSecond().MappedParams->setVarAddr(CGF, VD, Base.getAddress(CGF));
  }
  I->getSecond().MappedParams->apply(CGF);
}
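
// Roughly, for a 4-byte globalized local 'X' the prolog above emits
//   %X = call align 8 ptr @__kmpc_alloc_shared(i64 4)
// (the alignment value is target dependent), and emitGenericVarsEpilog later
// pairs it with
//   call void @__kmpc_free_shared(ptr %X, i64 4)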

bool CGOpenMPRuntimeGPU::isDelayedVariableLengthDecl(CodeGenFunction &CGF,
                                                     const VarDecl *VD) const {
  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I == FunctionGlobalizedDecls.end())
    return false;

  // Check if the variable declaration is delayed:
  return llvm::is_contained(I->getSecond().DelayedVariableLengthDecls, VD);
}

std::pair<llvm::Value *, llvm::Value *>
CGOpenMPRuntimeGPU::getKmpcAllocShared(CodeGenFunction &CGF,
                                       const VarDecl *VD) {
  CGBuilderTy &Bld = CGF.Builder;

  // Compute size and alignment.
  llvm::Value *Size = CGF.getTypeSize(VD->getType());
  CharUnits Align = CGM.getContext().getDeclAlign(VD);
  Size = Bld.CreateNUWAdd(
      Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
  llvm::Value *AlignVal =
      llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
  Size = Bld.CreateUDiv(Size, AlignVal);
  Size = Bld.CreateNUWMul(Size, AlignVal);

  // Allocate space for this VLA object to be globalized.
  llvm::Value *AllocArgs[] = {Size};
  llvm::CallBase *VoidPtr =
      CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                              CGM.getModule(), OMPRTL___kmpc_alloc_shared),
                          AllocArgs, VD->getName());
  VoidPtr->addRetAttr(llvm::Attribute::get(
      CGM.getLLVMContext(), llvm::Attribute::Alignment, Align.getQuantity()));

  return std::make_pair(VoidPtr, Size);
}
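
// Size round-up example: a 10-byte VLA with 8-byte alignment is padded to
// ((10 + 7) / 8) * 8 == 16 bytes before calling __kmpc_alloc_shared.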
1161
1162void CGOpenMPRuntimeGPU::getKmpcFreeShared(
1163 CodeGenFunction &CGF,
1164 const std::pair<llvm::Value *, llvm::Value *> &AddrSizePair) {
1165 // Deallocate the memory for each globalized VLA object
1166 CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1167 M&: CGM.getModule(), FnID: OMPRTL___kmpc_free_shared),
1168 args: {AddrSizePair.first, AddrSizePair.second});
1169}
1170
1171void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF) {
1172 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
1173 return;
1174
1175 const auto I = FunctionGlobalizedDecls.find(Val: CGF.CurFn);
1176 if (I != FunctionGlobalizedDecls.end()) {
1177 // Deallocate the memory for each globalized VLA object that was
1178 // globalized in the prolog (i.e. emitGenericVarsProlog).
1179 for (const auto &AddrSizePair :
1180 llvm::reverse(C&: I->getSecond().EscapedVariableLengthDeclsAddrs)) {
1181 CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1182 M&: CGM.getModule(), FnID: OMPRTL___kmpc_free_shared),
1183 args: {AddrSizePair.first, AddrSizePair.second});
1184 }
1185 // Deallocate the memory for each globalized value
1186 for (auto &Rec : llvm::reverse(C&: I->getSecond().LocalVarData)) {
1187 const auto *VD = cast<VarDecl>(Val: Rec.first);
1188 I->getSecond().MappedParams->restore(CGF);
1189
1190 llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
1191 CGF.getTypeSize(Ty: VD->getType())};
1192 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1193 M&: CGM.getModule(), FnID: OMPRTL___kmpc_free_shared),
1194 FreeArgs);
1195 }
1196 }
1197}
1198
1199void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,
1200 const OMPExecutableDirective &D,
1201 SourceLocation Loc,
1202 llvm::Function *OutlinedFn,
1203 ArrayRef<llvm::Value *> CapturedVars) {
1204 if (!CGF.HaveInsertPoint())
1205 return;
1206
1207 bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
1208
1209 Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(Ty: CGF.Int32Ty,
1210 /*Name=*/".zero.addr");
1211 CGF.Builder.CreateStore(Val: CGF.Builder.getInt32(/*C*/ 0), Addr: ZeroAddr);
1212 llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
1213 // We don't emit any thread id function call in bare kernel, but because the
1214 // outlined function has a pointer argument, we emit a nullptr here.
1215 if (IsBareKernel)
1216 OutlinedFnArgs.push_back(Elt: llvm::ConstantPointerNull::get(T: CGM.VoidPtrTy));
1217 else
1218 OutlinedFnArgs.push_back(Elt: emitThreadIDAddress(CGF, Loc).getPointer());
1219 OutlinedFnArgs.push_back(Elt: ZeroAddr.getPointer());
1220 OutlinedFnArgs.append(in_start: CapturedVars.begin(), in_end: CapturedVars.end());
1221 emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, Args: OutlinedFnArgs);
1222}

void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
                                          SourceLocation Loc,
                                          llvm::Function *OutlinedFn,
                                          ArrayRef<llvm::Value *> CapturedVars,
                                          const Expr *IfCond,
                                          llvm::Value *NumThreads) {
  if (!CGF.HaveInsertPoint())
    return;

  auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, IfCond,
                        NumThreads](CodeGenFunction &CGF,
                                    PrePostActionTy &Action) {
    CGBuilderTy &Bld = CGF.Builder;
    llvm::Value *NumThreadsVal = NumThreads;
    llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
    llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy);
    if (WFn)
      ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
    llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy);

    // Create a private scope that will globalize the arguments
    // passed from the outside of the target region.
    // TODO: Is that needed?
    CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);

    Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
        llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()),
        "captured_vars_addrs");
    // There's something to share.
    if (!CapturedVars.empty()) {
      // Prepare for parallel region. Indicate the outlined function.
      ASTContext &Ctx = CGF.getContext();
      unsigned Idx = 0;
      for (llvm::Value *V : CapturedVars) {
        Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx);
        llvm::Value *PtrV;
        if (V->getType()->isIntegerTy())
          PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
        else
          PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
        CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
                              Ctx.getPointerType(Ctx.VoidPtrTy));
        ++Idx;
      }
    }

    llvm::Value *IfCondVal = nullptr;
    if (IfCond)
      IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty,
                                    /* isSigned */ false);
    else
      IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);

    if (!NumThreadsVal)
      NumThreadsVal = llvm::ConstantInt::get(CGF.Int32Ty, -1);
    else
      NumThreadsVal = Bld.CreateZExtOrTrunc(NumThreadsVal, CGF.Int32Ty);

    assert(IfCondVal && "Expected a value");
    llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
    llvm::Value *Args[] = {
        RTLoc,
        getThreadID(CGF, Loc),
        IfCondVal,
        NumThreadsVal,
        llvm::ConstantInt::get(CGF.Int32Ty, -1),
        FnPtr,
        ID,
        Bld.CreateBitOrPointerCast(CapturedVarsAddrs.getPointer(),
                                   CGF.VoidPtrPtrTy),
        llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
    CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                            CGM.getModule(), OMPRTL___kmpc_parallel_51),
                        Args);
  };

  RegionCodeGenTy RCG(ParallelGen);
  RCG(CGF);
}
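
// The argument list above lines up with the device runtime entry point
//   __kmpc_parallel_51(loc, global_tid, if_expr, num_threads, proc_bind,
//                      fn, wrapper_fn, args, nargs)
// where proc_bind is hard-coded to -1 because proc_bind is not handled on
// the GPU (see emitProcBindClause above).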
1303
1304void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
1305 // Always emit simple barriers!
1306 if (!CGF.HaveInsertPoint())
1307 return;
1308 // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
1309 // This function does not use parameters, so we can emit just default values.
1310 llvm::Value *Args[] = {
1311 llvm::ConstantPointerNull::get(
1312 T: cast<llvm::PointerType>(getIdentTyPointerTy())),
1313 llvm::ConstantInt::get(Ty: CGF.Int32Ty, /*V=*/0, /*isSigned=*/IsSigned: true)};
1314 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1315 M&: CGM.getModule(), FnID: OMPRTL___kmpc_barrier_simple_spmd),
1316 Args);
1317}
1318
1319void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF,
1320 SourceLocation Loc,
1321 OpenMPDirectiveKind Kind, bool,
1322 bool) {
1323 // Always emit simple barriers!
1324 if (!CGF.HaveInsertPoint())
1325 return;
1326 // Build call __kmpc_cancel_barrier(loc, thread_id);
1327 unsigned Flags = getDefaultFlagsForBarriers(Kind);
1328 llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
1329 getThreadID(CGF, Loc)};
1330
1331 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1332 M&: CGM.getModule(), FnID: OMPRTL___kmpc_barrier),
1333 Args);
1334}
1335
1336void CGOpenMPRuntimeGPU::emitCriticalRegion(
1337 CodeGenFunction &CGF, StringRef CriticalName,
1338 const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
1339 const Expr *Hint) {
1340 llvm::BasicBlock *LoopBB = CGF.createBasicBlock(name: "omp.critical.loop");
1341 llvm::BasicBlock *TestBB = CGF.createBasicBlock(name: "omp.critical.test");
1342 llvm::BasicBlock *SyncBB = CGF.createBasicBlock(name: "omp.critical.sync");
1343 llvm::BasicBlock *BodyBB = CGF.createBasicBlock(name: "omp.critical.body");
1344 llvm::BasicBlock *ExitBB = CGF.createBasicBlock(name: "omp.critical.exit");
1345
1346 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1347
1348 // Get the mask of active threads in the warp.
1349 llvm::Value *Mask = CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1350 M&: CGM.getModule(), FnID: OMPRTL___kmpc_warp_active_thread_mask));
1351 // Fetch team-local id of the thread.
1352 llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
1353
1354 // Get the width of the team.
1355 llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);
1356
1357 // Initialize the counter variable for the loop.
1358 QualType Int32Ty =
1359 CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
1360 Address Counter = CGF.CreateMemTemp(T: Int32Ty, Name: "critical_counter");
1361 LValue CounterLVal = CGF.MakeAddrLValue(Addr: Counter, T: Int32Ty);
1362 CGF.EmitStoreOfScalar(value: llvm::Constant::getNullValue(Ty: CGM.Int32Ty), lvalue: CounterLVal,
1363 /*isInit=*/true);
1364
1365 // Block checks if loop counter exceeds upper bound.
1366 CGF.EmitBlock(BB: LoopBB);
1367 llvm::Value *CounterVal = CGF.EmitLoadOfScalar(lvalue: CounterLVal, Loc);
1368 llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(LHS: CounterVal, RHS: TeamWidth);
1369 CGF.Builder.CreateCondBr(Cond: CmpLoopBound, True: TestBB, False: ExitBB);
1370
1371 // Block tests which single thread should execute region, and which threads
1372 // should go straight to synchronisation point.
1373 CGF.EmitBlock(BB: TestBB);
1374 CounterVal = CGF.EmitLoadOfScalar(lvalue: CounterLVal, Loc);
1375 llvm::Value *CmpThreadToCounter =
1376 CGF.Builder.CreateICmpEQ(LHS: ThreadID, RHS: CounterVal);
1377 CGF.Builder.CreateCondBr(Cond: CmpThreadToCounter, True: BodyBB, False: SyncBB);
1378
1379 // Block emits the body of the critical region.
1380 CGF.EmitBlock(BB: BodyBB);
1381
1382 // Output the critical statement.
1383 CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
1384 Hint);
1385
1386 // After the body surrounded by the critical region, the single executing
1387 // thread will jump to the synchronisation point.
1388 // Block waits for all threads in current team to finish then increments the
1389 // counter variable and returns to the loop.
1390 CGF.EmitBlock(BB: SyncBB);
1391 // Reconverge active threads in the warp.
1392 (void)CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1393 M&: CGM.getModule(), FnID: OMPRTL___kmpc_syncwarp),
1394 args: Mask);
1395
1396 llvm::Value *IncCounterVal =
1397 CGF.Builder.CreateNSWAdd(LHS: CounterVal, RHS: CGF.Builder.getInt32(C: 1));
1398 CGF.EmitStoreOfScalar(value: IncCounterVal, lvalue: CounterLVal);
1399 CGF.EmitBranch(Block: LoopBB);
1400
1401 // Block that is reached when all threads in the team complete the region.
1402 CGF.EmitBlock(BB: ExitBB, /*IsFinished=*/true);
1403}
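
// In pseudocode, the blocks above serialize the critical region across the
// team roughly as:
//   for (Counter = 0; Counter < TeamWidth; ++Counter) {
//     if (ThreadID == Counter) { <critical body> }
//     __kmpc_syncwarp(Mask); // reconverge before the next iteration
//   }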

/// Cast value to the specified type.
static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
                                    QualType ValTy, QualType CastTy,
                                    SourceLocation Loc) {
  assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
         "Cast type must be sized.");
  assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
         "Val type must be sized.");
  llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
  if (ValTy == CastTy)
    return Val;
  if (CGF.getContext().getTypeSizeInChars(ValTy) ==
      CGF.getContext().getTypeSizeInChars(CastTy))
    return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
  if (CastTy->isIntegerType() && ValTy->isIntegerType())
    return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
                                     CastTy->hasSignedIntegerRepresentation());
  Address CastItem = CGF.CreateMemTemp(CastTy);
  Address ValCastItem = CastItem.withElementType(Val->getType());
  CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy,
                        LValueBaseInfo(AlignmentSource::Type),
                        TBAAAccessInfo());
  return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc,
                              LValueBaseInfo(AlignmentSource::Type),
                              TBAAAccessInfo());
}

/// This function creates calls to one of two shuffle functions to copy
/// variables between lanes in a warp.
static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
                                                 llvm::Value *Elem,
                                                 QualType ElemType,
                                                 llvm::Value *Offset,
                                                 SourceLocation Loc) {
  CodeGenModule &CGM = CGF.CGM;
  CGBuilderTy &Bld = CGF.Builder;
  CGOpenMPRuntimeGPU &RT =
      *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));
  llvm::OpenMPIRBuilder &OMPBuilder = RT.getOMPBuilder();

  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
  assert(Size.getQuantity() <= 8 &&
         "Unsupported bitwidth in shuffle instruction.");

  RuntimeFunction ShuffleFn = Size.getQuantity() <= 4
                                  ? OMPRTL___kmpc_shuffle_int32
                                  : OMPRTL___kmpc_shuffle_int64;

  // Cast all types to 32- or 64-bit values before calling shuffle routines.
  QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
      Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
  llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
  llvm::Value *WarpSize =
      Bld.CreateIntCast(RT.getGPUWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);

  llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
      OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), ShuffleFn),
      {ElemCast, Offset, WarpSize});

  return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
}
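
// For instance, a 4-byte reduction element (e.g. a float) is converted to
// i32, moved between lanes via __kmpc_shuffle_int32(val, offset, warp_size),
// and converted back; elements of 5-8 bytes take the __kmpc_shuffle_int64
// path instead.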
1466
1467static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
1468 Address DestAddr, QualType ElemType,
1469 llvm::Value *Offset, SourceLocation Loc) {
1470 CGBuilderTy &Bld = CGF.Builder;
1471
1472 CharUnits Size = CGF.getContext().getTypeSizeInChars(T: ElemType);
1473 // Create the loop over the big sized data.
1474 // ptr = (void*)Elem;
1475 // ptrEnd = (void*) Elem + 1;
1476 // Step = 8;
1477 // while (ptr + Step < ptrEnd)
1478 // shuffle((int64_t)*ptr);
1479 // Step = 4;
1480 // while (ptr + Step < ptrEnd)
1481 // shuffle((int32_t)*ptr);
1482 // ...
1483 Address ElemPtr = DestAddr;
1484 Address Ptr = SrcAddr;
1485 Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
1486 Addr: Bld.CreateConstGEP(Addr: SrcAddr, Index: 1), Ty: CGF.VoidPtrTy, ElementTy: CGF.Int8Ty);
1487 for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
1488 if (Size < CharUnits::fromQuantity(Quantity: IntSize))
1489 continue;
1490 QualType IntType = CGF.getContext().getIntTypeForBitwidth(
1491 DestWidth: CGF.getContext().toBits(CharSize: CharUnits::fromQuantity(Quantity: IntSize)),
1492 /*Signed=*/1);
1493 llvm::Type *IntTy = CGF.ConvertTypeForMem(T: IntType);
1494 Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Addr: Ptr, Ty: IntTy->getPointerTo(),
1495 ElementTy: IntTy);
1496 ElemPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1497 Addr: ElemPtr, Ty: IntTy->getPointerTo(), ElementTy: IntTy);
1498 if (Size.getQuantity() / IntSize > 1) {
1499 llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(name: ".shuffle.pre_cond");
1500 llvm::BasicBlock *ThenBB = CGF.createBasicBlock(name: ".shuffle.then");
1501 llvm::BasicBlock *ExitBB = CGF.createBasicBlock(name: ".shuffle.exit");
1502 llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
1503 CGF.EmitBlock(BB: PreCondBB);
1504 llvm::PHINode *PhiSrc =
1505 Bld.CreatePHI(Ty: Ptr.getType(), /*NumReservedValues=*/2);
1506 PhiSrc->addIncoming(V: Ptr.getPointer(), BB: CurrentBB);
1507 llvm::PHINode *PhiDest =
1508 Bld.CreatePHI(Ty: ElemPtr.getType(), /*NumReservedValues=*/2);
1509 PhiDest->addIncoming(V: ElemPtr.getPointer(), BB: CurrentBB);
1510 Ptr = Address(PhiSrc, Ptr.getElementType(), Ptr.getAlignment());
1511 ElemPtr =
1512 Address(PhiDest, ElemPtr.getElementType(), ElemPtr.getAlignment());
1513 llvm::Value *PtrDiff = Bld.CreatePtrDiff(
1514 ElemTy: CGF.Int8Ty, LHS: PtrEnd.getPointer(),
1515 RHS: Bld.CreatePointerBitCastOrAddrSpaceCast(V: Ptr.getPointer(),
1516 DestTy: CGF.VoidPtrTy));
1517 Bld.CreateCondBr(Cond: Bld.CreateICmpSGT(LHS: PtrDiff, RHS: Bld.getInt64(C: IntSize - 1)),
1518 True: ThenBB, False: ExitBB);
1519 CGF.EmitBlock(BB: ThenBB);
1520 llvm::Value *Res = createRuntimeShuffleFunction(
1521 CGF,
1522 Elem: CGF.EmitLoadOfScalar(Addr: Ptr, /*Volatile=*/false, Ty: IntType, Loc,
1523 BaseInfo: LValueBaseInfo(AlignmentSource::Type),
1524 TBAAInfo: TBAAAccessInfo()),
1525 ElemType: IntType, Offset, Loc);
1526 CGF.EmitStoreOfScalar(Value: Res, Addr: ElemPtr, /*Volatile=*/false, Ty: IntType,
1527 BaseInfo: LValueBaseInfo(AlignmentSource::Type),
1528 TBAAInfo: TBAAAccessInfo());
1529 Address LocalPtr = Bld.CreateConstGEP(Addr: Ptr, Index: 1);
1530 Address LocalElemPtr = Bld.CreateConstGEP(Addr: ElemPtr, Index: 1);
1531 PhiSrc->addIncoming(V: LocalPtr.getPointer(), BB: ThenBB);
1532 PhiDest->addIncoming(V: LocalElemPtr.getPointer(), BB: ThenBB);
1533 CGF.EmitBranch(Block: PreCondBB);
1534 CGF.EmitBlock(BB: ExitBB);
1535 } else {
1536 llvm::Value *Res = createRuntimeShuffleFunction(
1537 CGF,
1538 Elem: CGF.EmitLoadOfScalar(Addr: Ptr, /*Volatile=*/false, Ty: IntType, Loc,
1539 BaseInfo: LValueBaseInfo(AlignmentSource::Type),
1540 TBAAInfo: TBAAAccessInfo()),
1541 ElemType: IntType, Offset, Loc);
1542 CGF.EmitStoreOfScalar(Value: Res, Addr: ElemPtr, /*Volatile=*/false, Ty: IntType,
1543 BaseInfo: LValueBaseInfo(AlignmentSource::Type),
1544 TBAAInfo: TBAAAccessInfo());
1545 Ptr = Bld.CreateConstGEP(Addr: Ptr, Index: 1);
1546 ElemPtr = Bld.CreateConstGEP(Addr: ElemPtr, Index: 1);
1547 }
1548 Size = Size % IntSize;
1549 }
1550}
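// For instance, a 12-byte reduce element is shuffled as one 8-byte chunk
// followed by one 4-byte chunk (an illustration of the decomposition above,
// not the exact emitted IR):
//   __kmpc_shuffle_int64 on bytes [0, 8)
//   __kmpc_shuffle_int32 on bytes [8, 12)
// A chunk size with a trip count greater than one (e.g. the 8-byte step of a
// 24-byte element) takes the PHI-based loop path instead of the straight-line
// path.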
1551
1552namespace {
1553enum CopyAction : unsigned {
1554 // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
1555 // the warp using shuffle instructions.
1556 RemoteLaneToThread,
1557 // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
1558 ThreadCopy,
1559};
1560} // namespace
1561
1562struct CopyOptionsTy {
1563 llvm::Value *RemoteLaneOffset;
1564 llvm::Value *ScratchpadIndex;
1565 llvm::Value *ScratchpadWidth;
1566};
1567
1568/// Emit instructions to copy a Reduce list, which contains partially
1569/// aggregated values, in the specified direction.
1570static void emitReductionListCopy(
1571 CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
1572 ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
1573 CopyOptionsTy CopyOptions = {.RemoteLaneOffset: nullptr, .ScratchpadIndex: nullptr, .ScratchpadWidth: nullptr}) {
1574
1575 CodeGenModule &CGM = CGF.CGM;
1576 ASTContext &C = CGM.getContext();
1577 CGBuilderTy &Bld = CGF.Builder;
1578
1579 llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
1580
1581 // Iterate, element by element, through the source Reduce list and
1582 // make a copy.
1583 unsigned Idx = 0;
1584 for (const Expr *Private : Privates) {
1585 Address SrcElementAddr = Address::invalid();
1586 Address DestElementAddr = Address::invalid();
1587 Address DestElementPtrAddr = Address::invalid();
1588 // Should we shuffle in an element from a remote lane?
1589 bool ShuffleInElement = false;
1590 // Set to true to update the pointer in the dest Reduce list to a
1591 // newly created element.
1592 bool UpdateDestListPtr = false;
1593 QualType PrivatePtrType = C.getPointerType(T: Private->getType());
1594 llvm::Type *PrivateLlvmPtrType = CGF.ConvertType(T: PrivatePtrType);
1595
1596 switch (Action) {
1597 case RemoteLaneToThread: {
1598 // Step 1.1: Get the address for the src element in the Reduce list.
1599 Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(Addr: SrcBase, Index: Idx);
1600 SrcElementAddr = CGF.EmitLoadOfPointer(
1601 Ptr: SrcElementPtrAddr.withElementType(ElemTy: PrivateLlvmPtrType),
1602 PtrTy: PrivatePtrType->castAs<PointerType>());
1603
1604 // Step 1.2: Create a temporary to store the element in the destination
1605 // Reduce list.
1606 DestElementPtrAddr = Bld.CreateConstArrayGEP(Addr: DestBase, Index: Idx);
1607 DestElementAddr =
1608 CGF.CreateMemTemp(T: Private->getType(), Name: ".omp.reduction.element");
1609 ShuffleInElement = true;
1610 UpdateDestListPtr = true;
1611 break;
1612 }
1613 case ThreadCopy: {
1614 // Step 1.1: Get the address for the src element in the Reduce list.
1615 Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(Addr: SrcBase, Index: Idx);
1616 SrcElementAddr = CGF.EmitLoadOfPointer(
1617 Ptr: SrcElementPtrAddr.withElementType(ElemTy: PrivateLlvmPtrType),
1618 PtrTy: PrivatePtrType->castAs<PointerType>());
1619
1620 // Step 1.2: Get the address for dest element. The destination
1621 // element has already been created on the thread's stack.
1622 DestElementPtrAddr = Bld.CreateConstArrayGEP(Addr: DestBase, Index: Idx);
1623 DestElementAddr = CGF.EmitLoadOfPointer(
1624 Ptr: DestElementPtrAddr.withElementType(ElemTy: PrivateLlvmPtrType),
1625 PtrTy: PrivatePtrType->castAs<PointerType>());
1626 break;
1627 }
1628 }
1629
1630 // Regardless of the source and destination of the copy, we emit the load
1631 // of the src element here, since it is required for all copy directions.
1632 SrcElementAddr = SrcElementAddr.withElementType(
1633 ElemTy: CGF.ConvertTypeForMem(T: Private->getType()));
1634 DestElementAddr =
1635 DestElementAddr.withElementType(ElemTy: SrcElementAddr.getElementType());
1636
1637 // Now that all active lanes have read the element in the
1638 // Reduce list, shuffle over the value from the remote lane.
1639 if (ShuffleInElement) {
1640 shuffleAndStore(CGF, SrcAddr: SrcElementAddr, DestAddr: DestElementAddr, ElemType: Private->getType(),
1641 Offset: RemoteLaneOffset, Loc: Private->getExprLoc());
1642 } else {
1643 switch (CGF.getEvaluationKind(T: Private->getType())) {
1644 case TEK_Scalar: {
1645 llvm::Value *Elem = CGF.EmitLoadOfScalar(
1646 Addr: SrcElementAddr, /*Volatile=*/false, Ty: Private->getType(),
1647 Loc: Private->getExprLoc(), BaseInfo: LValueBaseInfo(AlignmentSource::Type),
1648 TBAAInfo: TBAAAccessInfo());
1649 // Store the source element value to the dest element address.
1650 CGF.EmitStoreOfScalar(
1651 Value: Elem, Addr: DestElementAddr, /*Volatile=*/false, Ty: Private->getType(),
1652 BaseInfo: LValueBaseInfo(AlignmentSource::Type), TBAAInfo: TBAAAccessInfo());
1653 break;
1654 }
1655 case TEK_Complex: {
1656 CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex(
1657 src: CGF.MakeAddrLValue(Addr: SrcElementAddr, T: Private->getType()),
1658 loc: Private->getExprLoc());
1659 CGF.EmitStoreOfComplex(
1660 V: Elem, dest: CGF.MakeAddrLValue(Addr: DestElementAddr, T: Private->getType()),
1661 /*isInit=*/false);
1662 break;
1663 }
1664 case TEK_Aggregate:
1665 CGF.EmitAggregateCopy(
1666 Dest: CGF.MakeAddrLValue(Addr: DestElementAddr, T: Private->getType()),
1667 Src: CGF.MakeAddrLValue(Addr: SrcElementAddr, T: Private->getType()),
1668 EltTy: Private->getType(), MayOverlap: AggValueSlot::DoesNotOverlap);
1669 break;
1670 }
1671 }
1672
1673 // Step 3.1: Modify reference in dest Reduce list as needed.
1674 // Modifying the reference in Reduce list to point to the newly
1675 // created element. The element is live in the current function
1676 // scope and that of functions it invokes (i.e., reduce_function).
1677 // RemoteReduceData[i] = (void*)&RemoteElem
1678 if (UpdateDestListPtr) {
1679 CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
1680 V: DestElementAddr.getPointer(), DestTy: CGF.VoidPtrTy),
1681 DestElementPtrAddr, /*Volatile=*/false,
1682 C.VoidPtrTy);
1683 }
1684
1685 ++Idx;
1686 }
1687}
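// In pseudo-code, the net per-element effect of the two copy actions above is
// roughly (illustrative only):
//   RemoteLaneToThread: tmp      = shuffle(*Src[i], RemoteLaneOffset);
//                       Dest[i]  = &tmp;        // list pointer updated
//   ThreadCopy:         *Dest[i] = *Src[i];     // list pointer unchanged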
1688
1689/// This function emits a helper that gathers Reduce lists from the first
1690/// lane of every active warp to lanes in the first warp.
1691///
1692/// void inter_warp_copy_func(void* reduce_data, num_warps)
1693/// shared smem[warp_size];
1694/// For all data entries D in reduce_data:
1695/// sync
1696/// If (I am the first lane in each warp)
1697/// Copy my local D to smem[warp_id]
1698/// sync
1699/// if (I am the first warp)
1700/// Copy smem[thread_id] to my local D
1701static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
1702 ArrayRef<const Expr *> Privates,
1703 QualType ReductionArrayTy,
1704 SourceLocation Loc) {
1705 ASTContext &C = CGM.getContext();
1706 llvm::Module &M = CGM.getModule();
1707
1708 // ReduceList: thread local Reduce list.
1709 // At the stage of the computation when this function is called, partially
1710 // aggregated values reside in the first lane of every active warp.
1711 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
1712 C.VoidPtrTy, ImplicitParamKind::Other);
1713 // NumWarps: number of warps active in the parallel region. This could
1714 // be smaller than 32 (max warps in a CTA) for partial block reduction.
1715 ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
1716 C.getIntTypeForBitwidth(DestWidth: 32, /* Signed */ true),
1717 ImplicitParamKind::Other);
1718 FunctionArgList Args;
1719 Args.push_back(&ReduceListArg);
1720 Args.push_back(&NumWarpsArg);
1721
1722 const CGFunctionInfo &CGFI =
1723 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
1724 auto *Fn = llvm::Function::Create(Ty: CGM.getTypes().GetFunctionType(Info: CGFI),
1725 Linkage: llvm::GlobalValue::InternalLinkage,
1726 N: "_omp_reduction_inter_warp_copy_func", M: &M);
1727 CGM.SetInternalFunctionAttributes(GD: GlobalDecl(), F: Fn, FI: CGFI);
1728 Fn->setDoesNotRecurse();
1729 CodeGenFunction CGF(CGM);
1730 CGF.StartFunction(GD: GlobalDecl(), RetTy: C.VoidTy, Fn: Fn, FnInfo: CGFI, Args, Loc, StartLoc: Loc);
1731
1732 CGBuilderTy &Bld = CGF.Builder;
1733
1734 // This array is used as a medium to transfer, one reduce element at a time,
1735 // the data from the first lane of every warp to lanes in the first warp
1736 // in order to perform the final step of a reduction in a parallel region
1737 // (reduction across warps). The array is placed in NVPTX __shared__ memory
1738 // for reduced latency, as well as to have a distinct copy for concurrently
1739 // executing target regions. The array is declared with weak linkage so
1740 // that a single definition is shared across compilation units.
1741 StringRef TransferMediumName =
1742 "__openmp_nvptx_data_transfer_temporary_storage";
1743 llvm::GlobalVariable *TransferMedium =
1744 M.getGlobalVariable(Name: TransferMediumName);
1745 unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
1746 if (!TransferMedium) {
1747 auto *Ty = llvm::ArrayType::get(ElementType: CGM.Int32Ty, NumElements: WarpSize);
1748 unsigned SharedAddressSpace = C.getTargetAddressSpace(AS: LangAS::cuda_shared);
1749 TransferMedium = new llvm::GlobalVariable(
1750 M, Ty, /*isConstant=*/false, llvm::GlobalVariable::WeakAnyLinkage,
1751 llvm::UndefValue::get(T: Ty), TransferMediumName,
1752 /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
1753 SharedAddressSpace);
1754 CGM.addCompilerUsedGlobal(GV: TransferMedium);
1755 }
1756
1757 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1758 // Get the CUDA thread id of the current OpenMP thread on the GPU.
1759 llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
1760 // nvptx_lane_id = nvptx_id % warpsize
1761 llvm::Value *LaneID = getNVPTXLaneID(CGF);
1762 // nvptx_warp_id = nvptx_id / warpsize
1763 llvm::Value *WarpID = getNVPTXWarpID(CGF);
1764
1765 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
1766 llvm::Type *ElemTy = CGF.ConvertTypeForMem(T: ReductionArrayTy);
1767 Address LocalReduceList(
1768 Bld.CreatePointerBitCastOrAddrSpaceCast(
1769 CGF.EmitLoadOfScalar(
1770 AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc,
1771 LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()),
1772 ElemTy->getPointerTo()),
1773 ElemTy, CGF.getPointerAlign());
1774
1775 unsigned Idx = 0;
1776 for (const Expr *Private : Privates) {
1777 //
1778 // Warp master copies reduce element to transfer medium in __shared__
1779 // memory.
1780 //
1781 unsigned RealTySize =
1782 C.getTypeSizeInChars(T: Private->getType())
1783 .alignTo(Align: C.getTypeAlignInChars(T: Private->getType()))
1784 .getQuantity();
1785 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
1786 unsigned NumIters = RealTySize / TySize;
1787 if (NumIters == 0)
1788 continue;
1789 QualType CType = C.getIntTypeForBitwidth(
1790 DestWidth: C.toBits(CharSize: CharUnits::fromQuantity(Quantity: TySize)), /*Signed=*/1);
1791 llvm::Type *CopyType = CGF.ConvertTypeForMem(T: CType);
1792 CharUnits Align = CharUnits::fromQuantity(Quantity: TySize);
1793 llvm::Value *Cnt = nullptr;
1794 Address CntAddr = Address::invalid();
1795 llvm::BasicBlock *PrecondBB = nullptr;
1796 llvm::BasicBlock *ExitBB = nullptr;
1797 if (NumIters > 1) {
1798 CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
1799 CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(Ty: CGM.IntTy), CntAddr,
1800 /*Volatile=*/false, C.IntTy);
1801 PrecondBB = CGF.createBasicBlock(name: "precond");
1802 ExitBB = CGF.createBasicBlock(name: "exit");
1803 llvm::BasicBlock *BodyBB = CGF.createBasicBlock(name: "body");
1804 // There is no need to emit line number for unconditional branch.
1805 (void)ApplyDebugLocation::CreateEmpty(CGF);
1806 CGF.EmitBlock(BB: PrecondBB);
1807 Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
1808 llvm::Value *Cmp =
1809 Bld.CreateICmpULT(LHS: Cnt, RHS: llvm::ConstantInt::get(Ty: CGM.IntTy, V: NumIters));
1810 Bld.CreateCondBr(Cond: Cmp, True: BodyBB, False: ExitBB);
1811 CGF.EmitBlock(BB: BodyBB);
1812 }
1813 // kmpc_barrier.
1814 CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
1815 /*EmitChecks=*/false,
1816 /*ForceSimpleCall=*/true);
1817 llvm::BasicBlock *ThenBB = CGF.createBasicBlock(name: "then");
1818 llvm::BasicBlock *ElseBB = CGF.createBasicBlock(name: "else");
1819 llvm::BasicBlock *MergeBB = CGF.createBasicBlock(name: "ifcont");
1820
1821 // if (lane_id == 0)
1822 llvm::Value *IsWarpMaster = Bld.CreateIsNull(Arg: LaneID, Name: "warp_master");
1823 Bld.CreateCondBr(Cond: IsWarpMaster, True: ThenBB, False: ElseBB);
1824 CGF.EmitBlock(BB: ThenBB);
1825
1826 // Reduce element = LocalReduceList[i]
1827 Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(Addr: LocalReduceList, Index: Idx);
1828 llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
1829 ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
1830 // elemptr = ((CopyType*)(elemptrptr)) + I
1831 Address ElemPtr(ElemPtrPtr, CopyType, Align);
1832 if (NumIters > 1)
1833 ElemPtr = Bld.CreateGEP(Addr: ElemPtr, Index: Cnt);
1834
1835 // Get pointer to location in transfer medium.
1836 // MediumPtr = &medium[warp_id]
1837 llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
1838 Ty: TransferMedium->getValueType(), Ptr: TransferMedium,
1839 IdxList: {llvm::Constant::getNullValue(Ty: CGM.Int64Ty), WarpID});
1840 // Casting to actual data type.
1841 // MediumPtr = (CopyType*)MediumPtrAddr;
1842 Address MediumPtr(MediumPtrVal, CopyType, Align);
1843
1844 // elem = *elemptr
1845 //*MediumPtr = elem
1846 llvm::Value *Elem = CGF.EmitLoadOfScalar(
1847 Addr: ElemPtr, /*Volatile=*/false, Ty: CType, Loc,
1848 BaseInfo: LValueBaseInfo(AlignmentSource::Type), TBAAInfo: TBAAAccessInfo());
1849 // Store the source element value to the dest element address.
1850 CGF.EmitStoreOfScalar(Value: Elem, Addr: MediumPtr, /*Volatile=*/true, Ty: CType,
1851 BaseInfo: LValueBaseInfo(AlignmentSource::Type),
1852 TBAAInfo: TBAAAccessInfo());
1853
1854 Bld.CreateBr(Dest: MergeBB);
1855
1856 CGF.EmitBlock(BB: ElseBB);
1857 Bld.CreateBr(Dest: MergeBB);
1858
1859 CGF.EmitBlock(BB: MergeBB);
1860
1861 // kmpc_barrier.
1862 CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
1863 /*EmitChecks=*/false,
1864 /*ForceSimpleCall=*/true);
1865
1866 //
1867 // Warp 0 copies reduce element from transfer medium.
1868 //
1869 llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock(name: "then");
1870 llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock(name: "else");
1871 llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock(name: "ifcont");
1872
1873 Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
1874 llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
1875 AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);
1876
1877 // Up to 32 threads in warp 0 are active.
1878 llvm::Value *IsActiveThread =
1879 Bld.CreateICmpULT(LHS: ThreadID, RHS: NumWarpsVal, Name: "is_active_thread");
1880 Bld.CreateCondBr(Cond: IsActiveThread, True: W0ThenBB, False: W0ElseBB);
1881
1882 CGF.EmitBlock(BB: W0ThenBB);
1883
1884 // SrcMediumPtr = &medium[tid]
1885 llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
1886 Ty: TransferMedium->getValueType(), Ptr: TransferMedium,
1887 IdxList: {llvm::Constant::getNullValue(Ty: CGM.Int64Ty), ThreadID});
1888 // SrcMediumVal = *SrcMediumPtr;
1889 Address SrcMediumPtr(SrcMediumPtrVal, CopyType, Align);
1890
1891 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
1892 Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(Addr: LocalReduceList, Index: Idx);
1893 llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
1894 TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc);
1895 Address TargetElemPtr(TargetElemPtrVal, CopyType, Align);
1896 if (NumIters > 1)
1897 TargetElemPtr = Bld.CreateGEP(Addr: TargetElemPtr, Index: Cnt);
1898
1899 // *TargetElemPtr = SrcMediumVal;
1900 llvm::Value *SrcMediumValue =
1901 CGF.EmitLoadOfScalar(Addr: SrcMediumPtr, /*Volatile=*/true, Ty: CType, Loc);
1902 CGF.EmitStoreOfScalar(Value: SrcMediumValue, Addr: TargetElemPtr, /*Volatile=*/false,
1903 Ty: CType);
1904 Bld.CreateBr(Dest: W0MergeBB);
1905
1906 CGF.EmitBlock(BB: W0ElseBB);
1907 Bld.CreateBr(Dest: W0MergeBB);
1908
1909 CGF.EmitBlock(BB: W0MergeBB);
1910
1911 if (NumIters > 1) {
1912 Cnt = Bld.CreateNSWAdd(LHS: Cnt, RHS: llvm::ConstantInt::get(Ty: CGM.IntTy, /*V=*/1));
1913 CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);
1914 CGF.EmitBranch(Block: PrecondBB);
1915 (void)ApplyDebugLocation::CreateEmpty(CGF);
1916 CGF.EmitBlock(BB: ExitBB);
1917 }
1918 RealTySize %= TySize;
1919 }
1920 ++Idx;
1921 }
1922
1923 CGF.FinishFunction();
1924 return Fn;
1925}
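// As an example of the chunking above (a sketch, not the exact emitted IR): a
// 6-byte reduce element is routed through the WarpSize x i32 transfer medium
// as one 4-byte round followed by one 2-byte round, with a runtime barrier
// before and after each round so that the warp masters publish their chunk
// before the threads of warp 0 read it back.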
1926
1927/// Emit a helper that reduces data across two OpenMP threads (lanes)
1928/// in the same warp. It uses shuffle instructions to copy over data from
1929/// a remote lane's stack. The reduction algorithm performed is specified
1930/// by the fourth parameter.
1931///
1932/// Algorithm Versions.
1933/// Full Warp Reduce (argument value 0):
1934/// This algorithm assumes that all 32 lanes are active and gathers
1935/// data from these 32 lanes, producing a single resultant value.
1936/// Contiguous Partial Warp Reduce (argument value 1):
1937/// This algorithm assumes that only a *contiguous* subset of lanes
1938/// are active. This happens for the last warp in a parallel region
1939/// when the user specified num_threads is not an integer multiple of
1940/// 32. This contiguous subset always starts with the zeroth lane.
1941/// Partial Warp Reduce (argument value 2):
1942/// This algorithm gathers data from any number of lanes at any position.
1943/// All reduced values are stored in the lowest possible lane. The set
1944/// of problems every algorithm addresses is a superset of those
1945/// addressable by algorithms with a lower version number. Overhead
1946/// increases as algorithm version increases.
1947///
1948/// Terminology
1949/// Reduce element:
1950/// Reduce element refers to the individual data field with primitive
1951/// data types to be combined and reduced across threads.
1952/// Reduce list:
1953/// Reduce list refers to a collection of local, thread-private
1954/// reduce elements.
1955/// Remote Reduce list:
1956/// Remote Reduce list refers to a collection of remote (relative to
1957/// the current thread) reduce elements.
1958///
1959/// We distinguish between three states of threads that are important to
1960/// the implementation of this function.
1961/// Alive threads:
1962/// Threads in a warp executing the SIMT instruction, as distinguished from
1963/// threads that are inactive due to divergent control flow.
1964/// Active threads:
1965/// The minimal set of threads that has to be alive upon entry to this
1966/// function. The computation is correct iff active threads are alive.
1967/// Some threads are alive but they are not active because they do not
1968/// contribute to the computation in any useful manner. Turning them off
1969/// may introduce control flow overheads without any tangible benefits.
1970/// Effective threads:
1971/// In order to comply with the argument requirements of the shuffle
1972/// function, we must keep all lanes holding data alive. But at most
1973/// half of them perform value aggregation; we refer to this half of
1974/// threads as effective. The other half is simply handing off their
1975/// data.
1976///
1977/// Procedure
1978/// Value shuffle:
1979/// In this step active threads transfer data from higher lane positions
1980/// in the warp to lower lane positions, creating Remote Reduce list.
1981/// Value aggregation:
1982/// In this step, effective threads combine their thread local Reduce list
1983/// with Remote Reduce list and store the result in the thread local
1984/// Reduce list.
1985/// Value copy:
1986/// In this step, we deal with the assumption made by algorithm 2
1987/// (i.e. contiguity assumption). When we have an odd number of lanes
1988/// active, say 2k+1, only k threads will be effective and therefore k
1989/// new values will be produced. However, the Reduce list owned by the
1990/// (2k+1)th thread is ignored in the value aggregation. Therefore
1991/// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
1992/// that the contiguity assumption still holds.
1993static llvm::Function *emitShuffleAndReduceFunction(
1994 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
1995 QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) {
1996 ASTContext &C = CGM.getContext();
1997
1998 // Thread local Reduce list used to host the values of data to be reduced.
1999 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2000 C.VoidPtrTy, ImplicitParamKind::Other);
2001 // Current lane id; could be logical.
2002 ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
2003 ImplicitParamKind::Other);
2004 // Offset of the remote source lane relative to the current lane.
2005 ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2006 C.ShortTy, ImplicitParamKind::Other);
2007 // Algorithm version. This is expected to be known at compile time.
2008 ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2009 C.ShortTy, ImplicitParamKind::Other);
2010 FunctionArgList Args;
2011 Args.push_back(&ReduceListArg);
2012 Args.push_back(&LaneIDArg);
2013 Args.push_back(&RemoteLaneOffsetArg);
2014 Args.push_back(&AlgoVerArg);
2015
2016 const CGFunctionInfo &CGFI =
2017 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2018 auto *Fn = llvm::Function::Create(
2019 Ty: CGM.getTypes().GetFunctionType(Info: CGFI), Linkage: llvm::GlobalValue::InternalLinkage,
2020 N: "_omp_reduction_shuffle_and_reduce_func", M: &CGM.getModule());
2021 CGM.SetInternalFunctionAttributes(GD: GlobalDecl(), F: Fn, FI: CGFI);
2022 Fn->setDoesNotRecurse();
2023
2024 CodeGenFunction CGF(CGM);
2025 CGF.StartFunction(GD: GlobalDecl(), RetTy: C.VoidTy, Fn: Fn, FnInfo: CGFI, Args, Loc, StartLoc: Loc);
2026
2027 CGBuilderTy &Bld = CGF.Builder;
2028
2029 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2030 llvm::Type *ElemTy = CGF.ConvertTypeForMem(T: ReductionArrayTy);
2031 Address LocalReduceList(
2032 Bld.CreatePointerBitCastOrAddrSpaceCast(
2033 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2034 C.VoidPtrTy, SourceLocation()),
2035 ElemTy->getPointerTo()),
2036 ElemTy, CGF.getPointerAlign());
2037
2038 Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
2039 llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
2040 AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2041
2042 Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
2043 llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
2044 AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2045
2046 Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
2047 llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
2048 AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2049
2050 // Create a local thread-private variable to host the Reduce list
2051 // from a remote lane.
2052 Address RemoteReduceList =
2053 CGF.CreateMemTemp(T: ReductionArrayTy, Name: ".omp.reduction.remote_reduce_list");
2054
2055 // This loop iterates through the list of reduce elements and copies,
2056 // element by element, from a remote lane in the warp to RemoteReduceList,
2057 // hosted on the thread's stack.
2058 emitReductionListCopy(Action: RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
2059 SrcBase: LocalReduceList, DestBase: RemoteReduceList,
2060 CopyOptions: {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
2061 /*ScratchpadIndex=*/nullptr,
2062 /*ScratchpadWidth=*/nullptr});
2063
2064 // The actions to be performed on the Remote Reduce list are dependent
2065 // on the algorithm version.
2066 //
2067 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
2068 // LaneId % 2 == 0 && Offset > 0):
2069 // do the reduction value aggregation
2070 //
2071 // The thread local variable Reduce list is mutated in place to host the
2072 // reduced data, which is the aggregated value produced from local and
2073 // remote lanes.
2074 //
2075 // Note that AlgoVer is expected to be a constant integer known at compile
2076 // time.
2077 // When AlgoVer==0, the first conjunction evaluates to true, making
2078 // the entire predicate true during compile time.
2079 // When AlgoVer==1, the second conjunction has only the second part to be
2080 // evaluated during runtime. Other conjunctions evaluate to false
2081 // during compile time.
2082 // When AlgoVer==2, the third conjunction has only the second part to be
2083 // evaluated during runtime. Other conjunctions evaluate to false
2084 // during compile time.
2085 llvm::Value *CondAlgo0 = Bld.CreateIsNull(Arg: AlgoVerArgVal);
2086
2087 llvm::Value *Algo1 = Bld.CreateICmpEQ(LHS: AlgoVerArgVal, RHS: Bld.getInt16(C: 1));
2088 llvm::Value *CondAlgo1 = Bld.CreateAnd(
2089 LHS: Algo1, RHS: Bld.CreateICmpULT(LHS: LaneIDArgVal, RHS: RemoteLaneOffsetArgVal));
2090
2091 llvm::Value *Algo2 = Bld.CreateICmpEQ(LHS: AlgoVerArgVal, RHS: Bld.getInt16(C: 2));
2092 llvm::Value *CondAlgo2 = Bld.CreateAnd(
2093 LHS: Algo2, RHS: Bld.CreateIsNull(Arg: Bld.CreateAnd(LHS: LaneIDArgVal, RHS: Bld.getInt16(C: 1))));
2094 CondAlgo2 = Bld.CreateAnd(
2095 LHS: CondAlgo2, RHS: Bld.CreateICmpSGT(LHS: RemoteLaneOffsetArgVal, RHS: Bld.getInt16(C: 0)));
2096
2097 llvm::Value *CondReduce = Bld.CreateOr(LHS: CondAlgo0, RHS: CondAlgo1);
2098 CondReduce = Bld.CreateOr(LHS: CondReduce, RHS: CondAlgo2);
2099
2100 llvm::BasicBlock *ThenBB = CGF.createBasicBlock(name: "then");
2101 llvm::BasicBlock *ElseBB = CGF.createBasicBlock(name: "else");
2102 llvm::BasicBlock *MergeBB = CGF.createBasicBlock(name: "ifcont");
2103 Bld.CreateCondBr(Cond: CondReduce, True: ThenBB, False: ElseBB);
2104
2105 CGF.EmitBlock(BB: ThenBB);
2106 // reduce_function(LocalReduceList, RemoteReduceList)
2107 llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2108 V: LocalReduceList.getPointer(), DestTy: CGF.VoidPtrTy);
2109 llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2110 V: RemoteReduceList.getPointer(), DestTy: CGF.VoidPtrTy);
2111 CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2112 CGF, Loc, OutlinedFn: ReduceFn, Args: {LocalReduceListPtr, RemoteReduceListPtr});
2113 Bld.CreateBr(Dest: MergeBB);
2114
2115 CGF.EmitBlock(BB: ElseBB);
2116 Bld.CreateBr(Dest: MergeBB);
2117
2118 CGF.EmitBlock(BB: MergeBB);
2119
2120 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
2121 // Reduce list.
2122 Algo1 = Bld.CreateICmpEQ(LHS: AlgoVerArgVal, RHS: Bld.getInt16(C: 1));
2123 llvm::Value *CondCopy = Bld.CreateAnd(
2124 LHS: Algo1, RHS: Bld.CreateICmpUGE(LHS: LaneIDArgVal, RHS: RemoteLaneOffsetArgVal));
2125
2126 llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock(name: "then");
2127 llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock(name: "else");
2128 llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock(name: "ifcont");
2129 Bld.CreateCondBr(Cond: CondCopy, True: CpyThenBB, False: CpyElseBB);
2130
2131 CGF.EmitBlock(BB: CpyThenBB);
2132 emitReductionListCopy(Action: ThreadCopy, CGF, ReductionArrayTy, Privates,
2133 SrcBase: RemoteReduceList, DestBase: LocalReduceList);
2134 Bld.CreateBr(Dest: CpyMergeBB);
2135
2136 CGF.EmitBlock(BB: CpyElseBB);
2137 Bld.CreateBr(Dest: CpyMergeBB);
2138
2139 CGF.EmitBlock(BB: CpyMergeBB);
2140
2141 CGF.FinishFunction();
2142 return Fn;
2143}
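// As a concrete illustration (following the 'Full Warp Reduce' pseudo-code in
// the design notes below), the runtime is expected to drive this helper as:
//   shuffleReduceFn(reduce_data, /*LaneId=*/0, /*Offset=*/16, /*AlgoVer=*/0);
//   shuffleReduceFn(reduce_data, /*LaneId=*/0, /*Offset=*/8,  /*AlgoVer=*/0);
//   ... and so on for offsets 4, 2 and 1 ...
// With AlgoVer == 0 the reduce predicate above folds to true at compile time,
// so every lane aggregates the shuffled-in remote Reduce list into its own.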
2144
2145/// This function emits a helper that copies all the reduction variables from
2146/// the team into the provided global buffer for the reduction variables.
2147///
2148/// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
2149/// For all data entries D in reduce_data:
2150/// Copy local D to buffer.D[Idx]
2151static llvm::Value *emitListToGlobalCopyFunction(
2152 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2153 QualType ReductionArrayTy, SourceLocation Loc,
2154 const RecordDecl *TeamReductionRec,
2155 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2156 &VarFieldMap) {
2157 ASTContext &C = CGM.getContext();
2158
2159 // Buffer: global reduction buffer.
2160 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2161 C.VoidPtrTy, ImplicitParamKind::Other);
2162 // Idx: index of the buffer.
2163 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2164 ImplicitParamKind::Other);
2165 // ReduceList: thread local Reduce list.
2166 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2167 C.VoidPtrTy, ImplicitParamKind::Other);
2168 FunctionArgList Args;
2169 Args.push_back(&BufferArg);
2170 Args.push_back(&IdxArg);
2171 Args.push_back(&ReduceListArg);
2172
2173 const CGFunctionInfo &CGFI =
2174 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2175 auto *Fn = llvm::Function::Create(
2176 Ty: CGM.getTypes().GetFunctionType(Info: CGFI), Linkage: llvm::GlobalValue::InternalLinkage,
2177 N: "_omp_reduction_list_to_global_copy_func", M: &CGM.getModule());
2178 CGM.SetInternalFunctionAttributes(GD: GlobalDecl(), F: Fn, FI: CGFI);
2179 Fn->setDoesNotRecurse();
2180 CodeGenFunction CGF(CGM);
2181 CGF.StartFunction(GD: GlobalDecl(), RetTy: C.VoidTy, Fn: Fn, FnInfo: CGFI, Args, Loc, StartLoc: Loc);
2182
2183 CGBuilderTy &Bld = CGF.Builder;
2184
2185 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2186 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2187 llvm::Type *ElemTy = CGF.ConvertTypeForMem(T: ReductionArrayTy);
2188 Address LocalReduceList(
2189 Bld.CreatePointerBitCastOrAddrSpaceCast(
2190 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2191 C.VoidPtrTy, Loc),
2192 ElemTy->getPointerTo()),
2193 ElemTy, CGF.getPointerAlign());
2194 QualType StaticTy = C.getRecordType(Decl: TeamReductionRec);
2195 llvm::Type *LLVMReductionsBufferTy =
2196 CGM.getTypes().ConvertTypeForMem(T: StaticTy);
2197 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2198 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2199 LLVMReductionsBufferTy->getPointerTo());
2200 llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2201 /*Volatile=*/false, C.IntTy,
2202 Loc)};
2203 unsigned Idx = 0;
2204 for (const Expr *Private : Privates) {
2205 // Reduce element = LocalReduceList[i]
2206 Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(Addr: LocalReduceList, Index: Idx);
2207 llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2208 ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2209 // elemptr = ((CopyType*)(elemptrptr)) + I
2210 ElemTy = CGF.ConvertTypeForMem(T: Private->getType());
2211 ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2212 V: ElemPtrPtr, DestTy: ElemTy->getPointerTo());
2213 Address ElemPtr =
2214 Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(T: Private->getType()));
2215 const ValueDecl *VD = cast<DeclRefExpr>(Val: Private)->getDecl();
2216 // Global = Buffer.VD[Idx];
2217 const FieldDecl *FD = VarFieldMap.lookup(Val: VD);
2218 llvm::Value *BufferPtr =
2219 Bld.CreateInBoundsGEP(Ty: LLVMReductionsBufferTy, Ptr: BufferArrPtr, IdxList: Idxs);
2220 LValue GlobLVal = CGF.EmitLValueForField(
2221 Base: CGF.MakeNaturalAlignAddrLValue(V: BufferPtr, T: StaticTy), Field: FD);
2222 Address GlobAddr = GlobLVal.getAddress(CGF);
2223 GlobLVal.setAddress(Address(GlobAddr.getPointer(),
2224 CGF.ConvertTypeForMem(T: Private->getType()),
2225 GlobAddr.getAlignment()));
2226 switch (CGF.getEvaluationKind(T: Private->getType())) {
2227 case TEK_Scalar: {
2228 llvm::Value *V = CGF.EmitLoadOfScalar(
2229 Addr: ElemPtr, /*Volatile=*/false, Ty: Private->getType(), Loc,
2230 BaseInfo: LValueBaseInfo(AlignmentSource::Type), TBAAInfo: TBAAAccessInfo());
2231 CGF.EmitStoreOfScalar(value: V, lvalue: GlobLVal);
2232 break;
2233 }
2234 case TEK_Complex: {
2235 CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(
2236 src: CGF.MakeAddrLValue(Addr: ElemPtr, T: Private->getType()), loc: Loc);
2237 CGF.EmitStoreOfComplex(V, dest: GlobLVal, /*isInit=*/false);
2238 break;
2239 }
2240 case TEK_Aggregate:
2241 CGF.EmitAggregateCopy(Dest: GlobLVal,
2242 Src: CGF.MakeAddrLValue(Addr: ElemPtr, T: Private->getType()),
2243 EltTy: Private->getType(), MayOverlap: AggValueSlot::DoesNotOverlap);
2244 break;
2245 }
2246 ++Idx;
2247 }
2248
2249 CGF.FinishFunction();
2250 return Fn;
2251}
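// For a hypothetical 'reduction(+:foo) reduction(*:bar)' with 'float foo' and
// 'double bar' (as in the design notes below), TeamReductionRec is roughly
// 'struct { float foo; double bar; }' and the generated copy amounts to (a
// sketch, not the exact emitted IR):
//   buffer[Idx].foo = *(float *)reduce_data[0];
//   buffer[Idx].bar = *(double *)reduce_data[1];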
2252
2253/// This function emits a helper that reduces all the reduction variables from
2254/// the team into the provided global buffer for the reduction variables.
2255///
2256/// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
2257/// void *GlobPtrs[];
2258/// GlobPtrs[0] = (void*)&buffer.D0[Idx];
2259/// ...
2260/// GlobPtrs[N] = (void*)&buffer.DN[Idx];
2261/// reduce_function(GlobPtrs, reduce_data);
2262static llvm::Value *emitListToGlobalReduceFunction(
2263 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2264 QualType ReductionArrayTy, SourceLocation Loc,
2265 const RecordDecl *TeamReductionRec,
2266 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2267 &VarFieldMap,
2268 llvm::Function *ReduceFn) {
2269 ASTContext &C = CGM.getContext();
2270
2271 // Buffer: global reduction buffer.
2272 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2273 C.VoidPtrTy, ImplicitParamKind::Other);
2274 // Idx: index of the buffer.
2275 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2276 ImplicitParamKind::Other);
2277 // ReduceList: thread local Reduce list.
2278 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2279 C.VoidPtrTy, ImplicitParamKind::Other);
2280 FunctionArgList Args;
2281 Args.push_back(&BufferArg);
2282 Args.push_back(&IdxArg);
2283 Args.push_back(&ReduceListArg);
2284
2285 const CGFunctionInfo &CGFI =
2286 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2287 auto *Fn = llvm::Function::Create(
2288 Ty: CGM.getTypes().GetFunctionType(Info: CGFI), Linkage: llvm::GlobalValue::InternalLinkage,
2289 N: "_omp_reduction_list_to_global_reduce_func", M: &CGM.getModule());
2290 CGM.SetInternalFunctionAttributes(GD: GlobalDecl(), F: Fn, FI: CGFI);
2291 Fn->setDoesNotRecurse();
2292 CodeGenFunction CGF(CGM);
2293 CGF.StartFunction(GD: GlobalDecl(), RetTy: C.VoidTy, Fn: Fn, FnInfo: CGFI, Args, Loc, StartLoc: Loc);
2294
2295 CGBuilderTy &Bld = CGF.Builder;
2296
2297 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2298 QualType StaticTy = C.getRecordType(Decl: TeamReductionRec);
2299 llvm::Type *LLVMReductionsBufferTy =
2300 CGM.getTypes().ConvertTypeForMem(T: StaticTy);
2301 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2302 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2303 LLVMReductionsBufferTy->getPointerTo());
2304
2305 // 1. Build a list of reduction variables.
2306 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2307 Address ReductionList =
2308 CGF.CreateMemTemp(T: ReductionArrayTy, Name: ".omp.reduction.red_list");
2309 auto IPriv = Privates.begin();
2310 llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2311 /*Volatile=*/false, C.IntTy,
2312 Loc)};
2313 unsigned Idx = 0;
2314 for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
2315 Address Elem = CGF.Builder.CreateConstArrayGEP(Addr: ReductionList, Index: Idx);
2316 // Global = Buffer.VD[Idx];
2317 const ValueDecl *VD = cast<DeclRefExpr>(Val: *IPriv)->getDecl();
2318 const FieldDecl *FD = VarFieldMap.lookup(Val: VD);
2319 llvm::Value *BufferPtr =
2320 Bld.CreateInBoundsGEP(Ty: LLVMReductionsBufferTy, Ptr: BufferArrPtr, IdxList: Idxs);
2321 LValue GlobLVal = CGF.EmitLValueForField(
2322 Base: CGF.MakeNaturalAlignAddrLValue(V: BufferPtr, T: StaticTy), Field: FD);
2323 Address GlobAddr = GlobLVal.getAddress(CGF);
2324 CGF.EmitStoreOfScalar(GlobAddr.getPointer(), Elem, /*Volatile=*/false,
2325 C.VoidPtrTy);
2326 if ((*IPriv)->getType()->isVariablyModifiedType()) {
2327 // Store array size.
2328 ++Idx;
2329 Elem = CGF.Builder.CreateConstArrayGEP(Addr: ReductionList, Index: Idx);
2330 llvm::Value *Size = CGF.Builder.CreateIntCast(
2331 V: CGF.getVLASize(
2332 vla: CGF.getContext().getAsVariableArrayType(T: (*IPriv)->getType()))
2333 .NumElts,
2334 DestTy: CGF.SizeTy, /*isSigned=*/false);
2335 CGF.Builder.CreateStore(Val: CGF.Builder.CreateIntToPtr(V: Size, DestTy: CGF.VoidPtrTy),
2336 Addr: Elem);
2337 }
2338 }
2339
2340 // Call reduce_function(GlobalReduceList, ReduceList)
2341 llvm::Value *GlobalReduceList = ReductionList.getPointer();
2342 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2343 llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
2344 AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
2345 CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2346 CGF, Loc, OutlinedFn: ReduceFn, Args: {GlobalReduceList, ReducedPtr});
2347 CGF.FinishFunction();
2348 return Fn;
2349}
2350
2351/// This function emits a helper that copies all the reduction variables from
2352/// the provided global buffer back into the team's thread-local reduce list.
2353///
2354/// void global_to_list_copy_func(void *buffer, int Idx, void *reduce_data)
2355/// For all data entries D in reduce_data:
2356/// Copy buffer.D[Idx] to local D;
2357static llvm::Value *emitGlobalToListCopyFunction(
2358 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2359 QualType ReductionArrayTy, SourceLocation Loc,
2360 const RecordDecl *TeamReductionRec,
2361 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2362 &VarFieldMap) {
2363 ASTContext &C = CGM.getContext();
2364
2365 // Buffer: global reduction buffer.
2366 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2367 C.VoidPtrTy, ImplicitParamKind::Other);
2368 // Idx: index of the buffer.
2369 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2370 ImplicitParamKind::Other);
2371 // ReduceList: thread local Reduce list.
2372 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2373 C.VoidPtrTy, ImplicitParamKind::Other);
2374 FunctionArgList Args;
2375 Args.push_back(&BufferArg);
2376 Args.push_back(&IdxArg);
2377 Args.push_back(&ReduceListArg);
2378
2379 const CGFunctionInfo &CGFI =
2380 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2381 auto *Fn = llvm::Function::Create(
2382 Ty: CGM.getTypes().GetFunctionType(Info: CGFI), Linkage: llvm::GlobalValue::InternalLinkage,
2383 N: "_omp_reduction_global_to_list_copy_func", M: &CGM.getModule());
2384 CGM.SetInternalFunctionAttributes(GD: GlobalDecl(), F: Fn, FI: CGFI);
2385 Fn->setDoesNotRecurse();
2386 CodeGenFunction CGF(CGM);
2387 CGF.StartFunction(GD: GlobalDecl(), RetTy: C.VoidTy, Fn: Fn, FnInfo: CGFI, Args, Loc, StartLoc: Loc);
2388
2389 CGBuilderTy &Bld = CGF.Builder;
2390
2391 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2392 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2393 llvm::Type *ElemTy = CGF.ConvertTypeForMem(T: ReductionArrayTy);
2394 Address LocalReduceList(
2395 Bld.CreatePointerBitCastOrAddrSpaceCast(
2396 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2397 C.VoidPtrTy, Loc),
2398 ElemTy->getPointerTo()),
2399 ElemTy, CGF.getPointerAlign());
2400 QualType StaticTy = C.getRecordType(Decl: TeamReductionRec);
2401 llvm::Type *LLVMReductionsBufferTy =
2402 CGM.getTypes().ConvertTypeForMem(T: StaticTy);
2403 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2404 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2405 LLVMReductionsBufferTy->getPointerTo());
2406
2407 llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2408 /*Volatile=*/false, C.IntTy,
2409 Loc)};
2410 unsigned Idx = 0;
2411 for (const Expr *Private : Privates) {
2412 // Reduce element = LocalReduceList[i]
2413 Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(Addr: LocalReduceList, Index: Idx);
2414 llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2415 ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2416 // elemptr = ((CopyType*)(elemptrptr)) + I
2417 ElemTy = CGF.ConvertTypeForMem(T: Private->getType());
2418 ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2419 V: ElemPtrPtr, DestTy: ElemTy->getPointerTo());
2420 Address ElemPtr =
2421 Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(T: Private->getType()));
2422 const ValueDecl *VD = cast<DeclRefExpr>(Val: Private)->getDecl();
2423 // Global = Buffer.VD[Idx];
2424 const FieldDecl *FD = VarFieldMap.lookup(Val: VD);
2425 llvm::Value *BufferPtr =
2426 Bld.CreateInBoundsGEP(Ty: LLVMReductionsBufferTy, Ptr: BufferArrPtr, IdxList: Idxs);
2427 LValue GlobLVal = CGF.EmitLValueForField(
2428 Base: CGF.MakeNaturalAlignAddrLValue(V: BufferPtr, T: StaticTy), Field: FD);
2429 Address GlobAddr = GlobLVal.getAddress(CGF);
2430 GlobLVal.setAddress(Address(GlobAddr.getPointer(),
2431 CGF.ConvertTypeForMem(T: Private->getType()),
2432 GlobAddr.getAlignment()));
2433 switch (CGF.getEvaluationKind(T: Private->getType())) {
2434 case TEK_Scalar: {
2435 llvm::Value *V = CGF.EmitLoadOfScalar(lvalue: GlobLVal, Loc);
2436 CGF.EmitStoreOfScalar(Value: V, Addr: ElemPtr, /*Volatile=*/false, Ty: Private->getType(),
2437 BaseInfo: LValueBaseInfo(AlignmentSource::Type),
2438 TBAAInfo: TBAAAccessInfo());
2439 break;
2440 }
2441 case TEK_Complex: {
2442 CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(src: GlobLVal, loc: Loc);
2443 CGF.EmitStoreOfComplex(V, dest: CGF.MakeAddrLValue(Addr: ElemPtr, T: Private->getType()),
2444 /*isInit=*/false);
2445 break;
2446 }
2447 case TEK_Aggregate:
2448 CGF.EmitAggregateCopy(Dest: CGF.MakeAddrLValue(Addr: ElemPtr, T: Private->getType()),
2449 Src: GlobLVal, EltTy: Private->getType(),
2450 MayOverlap: AggValueSlot::DoesNotOverlap);
2451 break;
2452 }
2453 ++Idx;
2454 }
2455
2456 CGF.FinishFunction();
2457 return Fn;
2458}
2459
2460/// This function emits a helper that reduces the data in the provided global
2461/// buffer into the team's thread-local reduce list (reduce_data).
2462///
2463/// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
2464/// void *GlobPtrs[];
2465/// GlobPtrs[0] = (void*)&buffer.D0[Idx];
2466/// ...
2467/// GlobPtrs[N] = (void*)&buffer.DN[Idx];
2468/// reduce_function(reduce_data, GlobPtrs);
2469static llvm::Value *emitGlobalToListReduceFunction(
2470 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2471 QualType ReductionArrayTy, SourceLocation Loc,
2472 const RecordDecl *TeamReductionRec,
2473 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2474 &VarFieldMap,
2475 llvm::Function *ReduceFn) {
2476 ASTContext &C = CGM.getContext();
2477
2478 // Buffer: global reduction buffer.
2479 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2480 C.VoidPtrTy, ImplicitParamKind::Other);
2481 // Idx: index of the buffer.
2482 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2483 ImplicitParamKind::Other);
2484 // ReduceList: thread local Reduce list.
2485 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2486 C.VoidPtrTy, ImplicitParamKind::Other);
2487 FunctionArgList Args;
2488 Args.push_back(&BufferArg);
2489 Args.push_back(&IdxArg);
2490 Args.push_back(&ReduceListArg);
2491
2492 const CGFunctionInfo &CGFI =
2493 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2494 auto *Fn = llvm::Function::Create(
2495 Ty: CGM.getTypes().GetFunctionType(Info: CGFI), Linkage: llvm::GlobalValue::InternalLinkage,
2496 N: "_omp_reduction_global_to_list_reduce_func", M: &CGM.getModule());
2497 CGM.SetInternalFunctionAttributes(GD: GlobalDecl(), F: Fn, FI: CGFI);
2498 Fn->setDoesNotRecurse();
2499 CodeGenFunction CGF(CGM);
2500 CGF.StartFunction(GD: GlobalDecl(), RetTy: C.VoidTy, Fn: Fn, FnInfo: CGFI, Args, Loc, StartLoc: Loc);
2501
2502 CGBuilderTy &Bld = CGF.Builder;
2503
2504 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2505 QualType StaticTy = C.getRecordType(Decl: TeamReductionRec);
2506 llvm::Type *LLVMReductionsBufferTy =
2507 CGM.getTypes().ConvertTypeForMem(T: StaticTy);
2508 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2509 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2510 LLVMReductionsBufferTy->getPointerTo());
2511
2512 // 1. Build a list of reduction variables.
2513 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2514 Address ReductionList =
2515 CGF.CreateMemTemp(T: ReductionArrayTy, Name: ".omp.reduction.red_list");
2516 auto IPriv = Privates.begin();
2517 llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2518 /*Volatile=*/false, C.IntTy,
2519 Loc)};
2520 unsigned Idx = 0;
2521 for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
2522 Address Elem = CGF.Builder.CreateConstArrayGEP(Addr: ReductionList, Index: Idx);
2523 // Global = Buffer.VD[Idx];
2524 const ValueDecl *VD = cast<DeclRefExpr>(Val: *IPriv)->getDecl();
2525 const FieldDecl *FD = VarFieldMap.lookup(Val: VD);
2526 llvm::Value *BufferPtr =
2527 Bld.CreateInBoundsGEP(Ty: LLVMReductionsBufferTy, Ptr: BufferArrPtr, IdxList: Idxs);
2528 LValue GlobLVal = CGF.EmitLValueForField(
2529 Base: CGF.MakeNaturalAlignAddrLValue(V: BufferPtr, T: StaticTy), Field: FD);
2530 Address GlobAddr = GlobLVal.getAddress(CGF);
2531 CGF.EmitStoreOfScalar(GlobAddr.getPointer(), Elem, /*Volatile=*/false,
2532 C.VoidPtrTy);
2533 if ((*IPriv)->getType()->isVariablyModifiedType()) {
2534 // Store array size.
2535 ++Idx;
2536 Elem = CGF.Builder.CreateConstArrayGEP(Addr: ReductionList, Index: Idx);
2537 llvm::Value *Size = CGF.Builder.CreateIntCast(
2538 V: CGF.getVLASize(
2539 vla: CGF.getContext().getAsVariableArrayType(T: (*IPriv)->getType()))
2540 .NumElts,
2541 DestTy: CGF.SizeTy, /*isSigned=*/false);
2542 CGF.Builder.CreateStore(Val: CGF.Builder.CreateIntToPtr(V: Size, DestTy: CGF.VoidPtrTy),
2543 Addr: Elem);
2544 }
2545 }
2546
2547 // Call reduce_function(ReduceList, GlobalReduceList)
2548 llvm::Value *GlobalReduceList = ReductionList.getPointer();
2549 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2550 llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
2551 AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
2552 CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2553 CGF, Loc, OutlinedFn: ReduceFn, Args: {ReducedPtr, GlobalReduceList});
2554 CGF.FinishFunction();
2555 return Fn;
2556}
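// Taken together, the four list_to_global/global_to_list helpers above are
// passed to the teams-reduction entry point of the runtime. In pseudo-code the
// intended division of labour is roughly (a sketch, not the runtime's actual
// implementation):
//   if (I am the team master)
//     list_to_global_{copy,reduce}_func(buffer, my_slot, reduce_data);
//   if (I am in the last team to finish)
//     for each used slot S:
//       global_to_list_{copy,reduce}_func(buffer, S, reduce_data);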
2557
2558///
2559/// Design of OpenMP reductions on the GPU
2560///
2561/// Consider a typical OpenMP program with one or more reduction
2562/// clauses:
2563///
2564/// float foo;
2565/// double bar;
2566/// #pragma omp target teams distribute parallel for \
2567/// reduction(+:foo) reduction(*:bar)
2568/// for (int i = 0; i < N; i++) {
2569/// foo += A[i]; bar *= B[i];
2570/// }
2571///
2572/// where 'foo' and 'bar' are reduced across all OpenMP threads in
2573/// all teams. In our OpenMP implementation on the NVPTX device an
2574/// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
2575/// within a team are mapped to CUDA threads within a threadblock.
2576/// Our goal is to efficiently aggregate values across all OpenMP
2577/// threads such that:
2578///
2579/// - the compiler and runtime are logically concise, and
2580/// - the reduction is performed efficiently in a hierarchical
2581/// manner as follows: within OpenMP threads in the same warp,
2582/// across warps in a threadblock, and finally across teams on
2583/// the NVPTX device.
2584///
2585/// Introduction to Decoupling
2586///
2587/// We would like to decouple the compiler and the runtime so that the
2588/// latter is ignorant of the reduction variables (number, data types)
2589/// and the reduction operators. This allows a simpler interface
2590/// and implementation while still attaining good performance.
2591///
2592/// Pseudocode for the aforementioned OpenMP program generated by the
2593/// compiler is as follows:
2594///
2595/// 1. Create private copies of reduction variables on each OpenMP
2596/// thread: 'foo_private', 'bar_private'
2597/// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
2598/// to it and writes the result in 'foo_private' and 'bar_private'
2599/// respectively.
2600/// 3. Call the OpenMP runtime on the GPU to reduce within a team
2601/// and store the result on the team master:
2602///
2603/// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
2604/// reduceData, shuffleReduceFn, interWarpCpyFn)
2605///
2606/// where:
2607/// struct ReduceData {
2608/// float *foo;
2609/// double *bar;
2610/// } reduceData
2611/// reduceData.foo = &foo_private
2612/// reduceData.bar = &bar_private
2613///
2614/// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
2615/// auxiliary functions generated by the compiler that operate on
2616/// variables of type 'ReduceData'. They help the runtime perform
2617/// algorithmic steps in a data-agnostic manner.
2618///
2619/// 'shuffleReduceFn' is a pointer to a function that reduces data
2620/// of type 'ReduceData' across two OpenMP threads (lanes) in the
2621/// same warp. It takes the following arguments as input:
2622///
2623/// a. variable of type 'ReduceData' on the calling lane,
2624/// b. its lane_id,
2625/// c. an offset relative to the current lane_id to generate a
2626/// remote_lane_id. The remote lane contains the second
2627/// variable of type 'ReduceData' that is to be reduced.
2628/// d. an algorithm version parameter determining which reduction
2629/// algorithm to use.
2630///
2631/// 'shuffleReduceFn' retrieves data from the remote lane using
2632/// efficient GPU shuffle intrinsics and reduces, using the
2633/// algorithm specified by the 4th parameter, the two operands
2634/// element-wise. The result is written to the first operand.
2635///
2636/// Different reduction algorithms are implemented in different
2637/// runtime functions, all calling 'shuffleReduceFn' to perform
2638/// the essential reduction step. Therefore, based on the 4th
2639/// parameter, this function behaves slightly differently to
2640/// cooperate with the runtime to ensure correctness under
2641/// different circumstances.
2642///
2643/// 'InterWarpCpyFn' is a pointer to a function that transfers
2644/// reduced variables across warps. It tunnels, through CUDA
2645/// shared memory, the thread-private data of type 'ReduceData'
2646/// from lane 0 of each warp to a lane in the first warp.
2647/// 4. Call the OpenMP runtime on the GPU to reduce across teams.
2648/// The last team writes the global reduced value to memory.
2649///
2650/// ret = __kmpc_nvptx_teams_reduce_nowait(...,
2651/// reduceData, shuffleReduceFn, interWarpCpyFn,
2652/// scratchpadCopyFn, loadAndReduceFn)
2653///
2654/// 'scratchpadCopyFn' is a helper that stores reduced
2655/// data from the team master to a scratchpad array in
2656/// global memory.
2657///
2658/// 'loadAndReduceFn' is a helper that loads data from
2659/// the scratchpad array and reduces it with the input
2660/// operand.
2661///
2662/// These compiler generated functions hide address
2663/// calculation and alignment information from the runtime.
2664/// 5. if ret == 1:
2665/// The team master of the last team stores the reduced
2666/// result to the globals in memory.
2667/// foo += reduceData.foo; bar *= reduceData.bar
2668///
2669///
2670/// Warp Reduction Algorithms
2671///
2672/// On the warp level, we have three algorithms implemented in the
2673/// OpenMP runtime depending on the number of active lanes:
2674///
2675/// Full Warp Reduction
2676///
2677/// The reduce algorithm within a warp where all lanes are active
2678/// is implemented in the runtime as follows:
2679///
2680/// full_warp_reduce(void *reduce_data,
2681/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
2682/// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
2683/// ShuffleReduceFn(reduce_data, 0, offset, 0);
2684/// }
2685///
2686/// The algorithm completes in log(2, WARPSIZE) steps.
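/// For example, with WARPSIZE == 32 the loop issues offsets 16, 8, 4, 2 and 1;
/// after these five steps lane 0 holds the value reduced across all 32 lanes.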
2687///
2688/// 'ShuffleReduceFn' is called here with lane_id set to 0 because the lane_id
2689/// is not used; this saves the instructions needed to retrieve lane_id
2690/// from the corresponding special registers. The 4th parameter, which
2691/// represents the version of the algorithm being used, is set to 0 to
2692/// signify full warp reduction.
2693///
2694/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2695///
2696/// #reduce_elem refers to an element in the local lane's data structure
2697/// #remote_elem is retrieved from a remote lane
2698/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2699/// reduce_elem = reduce_elem REDUCE_OP remote_elem;
2700///
2701/// Contiguous Partial Warp Reduction
2702///
2703/// This reduce algorithm is used within a warp where only the first
2704/// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
2705/// number of OpenMP threads in a parallel region is not a multiple of
2706/// WARPSIZE. The algorithm is implemented in the runtime as follows:
2707///
2708/// void
2709/// contiguous_partial_reduce(void *reduce_data,
2710/// kmp_ShuffleReductFctPtr ShuffleReduceFn,
2711/// int size, int lane_id) {
2712/// int curr_size;
2713/// int offset;
2714/// curr_size = size;
2715/// offset = curr_size/2;
2716/// while (offset>0) {
2717/// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
2718/// curr_size = (curr_size+1)/2;
2719/// offset = curr_size/2;
2720/// }
2721/// }
2722///
2723/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2724///
2725/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2726/// if (lane_id < offset)
2727/// reduce_elem = reduce_elem REDUCE_OP remote_elem
2728/// else
2729/// reduce_elem = remote_elem
2730///
2731/// This algorithm assumes that the data to be reduced are located in a
2732/// contiguous subset of lanes starting from the first. When there is
2733/// an odd number of active lanes, the data in the last lane is not
2734/// aggregated with any other lane's data but is instead copied over.
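///
/// For example, with n == 5 active lanes holding a, b, c, d, e and
/// REDUCE_OP == +, the offsets are 2, 1, 1:
///   offset 2: lane 0 = a+c, lane 1 = b+d, lane 2 copies e
///   offset 1: lane 0 = a+b+c+d, lane 1 copies e
///   offset 1: lane 0 = a+b+c+d+e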
2735///
2736/// Dispersed Partial Warp Reduction
2737///
2738/// This algorithm is used within a warp when any discontiguous subset of
2739/// lanes is active. It is used to implement the reduction operation
2740/// across lanes in an OpenMP simd region or in a nested parallel region.
2741///
2742/// void
2743/// dispersed_partial_reduce(void *reduce_data,
2744/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
2745/// int size, remote_id;
2746/// int logical_lane_id = number_of_active_lanes_before_me() * 2;
2747/// do {
2748/// remote_id = next_active_lane_id_right_after_me();
2749///       # the above function returns 0 if no active lane
2750/// # is present right after the current lane.
2751/// size = number_of_active_lanes_in_this_warp();
2752/// logical_lane_id /= 2;
2753/// ShuffleReduceFn(reduce_data, logical_lane_id,
2754/// remote_id-1-threadIdx.x, 2);
2755/// } while (logical_lane_id % 2 == 0 && size > 1);
2756/// }
2757///
2758/// There is no assumption made about the initial state of the reduction.
2759/// Any number of lanes (>=1) could be active at any position. The reduction
2760/// result is returned in the first active lane.
2761///
2762/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2763///
2764/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2765/// if (lane_id % 2 == 0 && offset > 0)
2766/// reduce_elem = reduce_elem REDUCE_OP remote_elem
2767/// else
2768/// reduce_elem = remote_elem
2769///
2770///
2771/// Intra-Team Reduction
2772///
2773/// This function, as implemented in the runtime call
2774/// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
2775/// threads in a team. It first reduces within a warp using the
2776/// aforementioned algorithms. We then proceed to gather all such
2777/// reduced values at the first warp.
2778///
2779/// The runtime makes use of the function 'InterWarpCpyFn', which copies
2780/// data from each of the "warp masters" (zeroth lane of each warp, where
2781/// warp-reduced data is held) to the zeroth warp. This step reduces (in
2782/// a mathematical sense) the problem of reduction across warp masters in
2783/// a block to the problem of warp reduction.
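///
/// For example, with 1024 OpenMP threads in a team (32 warps of 32 lanes),
/// the intra-warp step leaves one partial value per warp; 'InterWarpCpyFn'
/// copies these 32 values to the first warp, where a second warp-level
/// reduction produces the team's result.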
2784///
2785///
2786/// Inter-Team Reduction
2787///
2788/// Once a team has reduced its data to a single value, it is stored in
2789/// a global scratchpad array. Since each team has a distinct slot, this
2790/// can be done without locking.
2791///
2792/// The last team to write to the scratchpad array proceeds to reduce the
2793/// scratchpad array. One or more workers in the last team use the helper
2794/// 'loadAndReduceFn' to load and reduce values from the array, i.e.,
2795/// the k'th worker reduces every k'th element.
2796///
2797/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
2798/// reduce across workers and compute a globally reduced value.
2799///
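/// As a rough, illustrative sketch (the variable names below are not the
/// generated ones), the code emitted for a parallel reduction has the shape:
///
///   void *RedList[] = {&reduce_data.foo, &reduce_data.bar};
///   int32_t ret = __kmpc_nvptx_parallel_reduce_nowait_v2(
///       loc, reduce_data_size, RedList, shuffleReduceFn, interWarpCpyFn);
///   if (ret == 1) {
///     foo += reduce_data.foo; bar *= reduce_data.bar;
///   }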
2800void CGOpenMPRuntimeGPU::emitReduction(
2801 CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
2802 ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
2803 ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
2804 if (!CGF.HaveInsertPoint())
2805 return;
2806
2807 bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
2808#ifndef NDEBUG
2809 bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
2810#endif
2811
2812 if (Options.SimpleReduction) {
2813 assert(!TeamsReduction && !ParallelReduction &&
2814 "Invalid reduction selection in emitReduction.");
2815 CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
2816 ReductionOps, Options);
2817 return;
2818 }
2819
2820 assert((TeamsReduction || ParallelReduction) &&
2821 "Invalid reduction selection in emitReduction.");
2822
2823 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
2824 llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
2825 int Cnt = 0;
2826 for (const Expr *DRE : Privates) {
2827 PrivatesReductions[Cnt] = cast<DeclRefExpr>(Val: DRE)->getDecl();
2828 ++Cnt;
2829 }
2830
2831 ASTContext &C = CGM.getContext();
2832 const RecordDecl *ReductionRec = ::buildRecordForGlobalizedVars(
2833 C&: CGM.getContext(), EscapedDecls: PrivatesReductions, EscapedDeclsForTeams: std::nullopt, MappedDeclsFields&: VarFieldMap, BufSize: 1);
2834
2835 // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
2836 // RedList, shuffle_reduce_func, interwarp_copy_func);
2837 // or
2838 // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
2839 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2840
2841 llvm::Value *Res;
2842 // 1. Build a list of reduction variables.
2843 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2844 auto Size = RHSExprs.size();
2845 for (const Expr *E : Privates) {
2846 if (E->getType()->isVariablyModifiedType())
2847 // Reserve place for array size.
2848 ++Size;
2849 }
2850 llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
2851 QualType ReductionArrayTy = C.getConstantArrayType(
2852 EltTy: C.VoidPtrTy, ArySize: ArraySize, SizeExpr: nullptr, ASM: ArraySizeModifier::Normal,
2853 /*IndexTypeQuals=*/0);
2854 Address ReductionList =
2855 CGF.CreateMemTemp(T: ReductionArrayTy, Name: ".omp.reduction.red_list");
2856 auto IPriv = Privates.begin();
2857 unsigned Idx = 0;
2858 for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
2859 Address Elem = CGF.Builder.CreateConstArrayGEP(Addr: ReductionList, Index: Idx);
2860 CGF.Builder.CreateStore(
2861 Val: CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2862 V: CGF.EmitLValue(E: RHSExprs[I]).getPointer(CGF), DestTy: CGF.VoidPtrTy),
2863 Addr: Elem);
2864 if ((*IPriv)->getType()->isVariablyModifiedType()) {
2865 // Store array size.
2866 ++Idx;
2867 Elem = CGF.Builder.CreateConstArrayGEP(Addr: ReductionList, Index: Idx);
2868 llvm::Value *Size = CGF.Builder.CreateIntCast(
2869 V: CGF.getVLASize(
2870 vla: CGF.getContext().getAsVariableArrayType(T: (*IPriv)->getType()))
2871 .NumElts,
2872 DestTy: CGF.SizeTy, /*isSigned=*/false);
2873 CGF.Builder.CreateStore(Val: CGF.Builder.CreateIntToPtr(V: Size, DestTy: CGF.VoidPtrTy),
2874 Addr: Elem);
2875 }
2876 }
2877
2878 llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2879 V: ReductionList.getPointer(), DestTy: CGF.VoidPtrTy);
2880 llvm::Function *ReductionFn = emitReductionFunction(
2881 CGF.CurFn->getName(), Loc, CGF.ConvertTypeForMem(T: ReductionArrayTy),
2882 Privates, LHSExprs, RHSExprs, ReductionOps);
2883 llvm::Value *ReductionDataSize =
2884 CGF.getTypeSize(Ty: C.getRecordType(Decl: ReductionRec));
2885 ReductionDataSize =
2886 CGF.Builder.CreateSExtOrTrunc(V: ReductionDataSize, DestTy: CGF.Int64Ty);
2887 llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
2888 CGM, Privates, ReductionArrayTy, ReduceFn: ReductionFn, Loc);
2889 llvm::Value *InterWarpCopyFn =
2890 emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
2891
2892 if (ParallelReduction) {
2893 llvm::Value *Args[] = {RTLoc, ReductionDataSize, RL, ShuffleAndReduceFn,
2894 InterWarpCopyFn};
2895
2896 Res = CGF.EmitRuntimeCall(
2897 callee: OMPBuilder.getOrCreateRuntimeFunction(
2898 M&: CGM.getModule(), FnID: OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2),
2899 args: Args);
2900 } else {
2901 assert(TeamsReduction && "expected teams reduction.");
2902 TeamsReductions.push_back(Elt: ReductionRec);
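    // The teams path additionally needs the runtime's fixed reduction buffer
    // and four compiler-generated helpers that copy/reduce a thread's
    // reduction list to and from a slot of that buffer.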
2903 auto *KernelTeamsReductionPtr = CGF.EmitRuntimeCall(
2904 callee: OMPBuilder.getOrCreateRuntimeFunction(
2905 M&: CGM.getModule(), FnID: OMPRTL___kmpc_reduction_get_fixed_buffer),
2906 args: {}, name: "_openmp_teams_reductions_buffer_$_$ptr");
2907 llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
2908 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec: ReductionRec, VarFieldMap);
2909 llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
2910 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec: ReductionRec, VarFieldMap,
2911 ReduceFn: ReductionFn);
2912 llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction(
2913 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec: ReductionRec, VarFieldMap);
2914 llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction(
2915 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec: ReductionRec, VarFieldMap,
2916 ReduceFn: ReductionFn);
2917
2918 llvm::Value *Args[] = {
2919 RTLoc,
2920 KernelTeamsReductionPtr,
2921 CGF.Builder.getInt32(C: C.getLangOpts().OpenMPCUDAReductionBufNum),
2922 ReductionDataSize,
2923 RL,
2924 ShuffleAndReduceFn,
2925 InterWarpCopyFn,
2926 GlobalToBufferCpyFn,
2927 GlobalToBufferRedFn,
2928 BufferToGlobalCpyFn,
2929 BufferToGlobalRedFn};
2930
2931 Res = CGF.EmitRuntimeCall(
2932 callee: OMPBuilder.getOrCreateRuntimeFunction(
2933 M&: CGM.getModule(), FnID: OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2),
2934 args: Args);
2935 }
2936
2937 // 5. Build if (res == 1)
2938 llvm::BasicBlock *ExitBB = CGF.createBasicBlock(name: ".omp.reduction.done");
2939 llvm::BasicBlock *ThenBB = CGF.createBasicBlock(name: ".omp.reduction.then");
2940 llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
2941 LHS: Res, RHS: llvm::ConstantInt::get(Ty: CGM.Int32Ty, /*V=*/1));
2942 CGF.Builder.CreateCondBr(Cond, True: ThenBB, False: ExitBB);
2943
2944 // 6. Build then branch: where we have reduced values in the master
2945 // thread in each team.
2946 // __kmpc_end_reduce{_nowait}(<gtid>);
2947 // break;
2948 CGF.EmitBlock(BB: ThenBB);
2949
2950 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
2951 auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
2952 this](CodeGenFunction &CGF, PrePostActionTy &Action) {
2953 auto IPriv = Privates.begin();
2954 auto ILHS = LHSExprs.begin();
2955 auto IRHS = RHSExprs.begin();
2956 for (const Expr *E : ReductionOps) {
2957 emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(Val: *ILHS),
2958 cast<DeclRefExpr>(Val: *IRHS));
2959 ++IPriv;
2960 ++ILHS;
2961 ++IRHS;
2962 }
2963 };
2964 RegionCodeGenTy RCG(CodeGen);
2965 RCG(CGF);
2966  // There is no need to emit a line number for the unconditional branch.
2967 (void)ApplyDebugLocation::CreateEmpty(CGF);
2968 CGF.EmitBlock(BB: ExitBB, /*IsFinished=*/true);
2969}
2970
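/// Rewrite a reference-typed native parameter into the form the outlined
/// device function expects: a restrict-qualified pointer in the NVPTX local
/// address space (the pointee is placed in the global address space for
/// 'map' captures). Parameters that are not references are returned
/// unchanged.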
2971const VarDecl *
2972CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD,
2973 const VarDecl *NativeParam) const {
2974 if (!NativeParam->getType()->isReferenceType())
2975 return NativeParam;
2976 QualType ArgType = NativeParam->getType();
2977 QualifierCollector QC;
2978 const Type *NonQualTy = QC.strip(type: ArgType);
2979 QualType PointeeTy = cast<ReferenceType>(Val: NonQualTy)->getPointeeType();
2980 if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
2981 if (Attr->getCaptureKind() == OMPC_map) {
2982 PointeeTy = CGM.getContext().getAddrSpaceQualType(T: PointeeTy,
2983 AddressSpace: LangAS::opencl_global);
2984 }
2985 }
2986 ArgType = CGM.getContext().getPointerType(T: PointeeTy);
2987 QC.addRestrict();
2988 enum { NVPTX_local_addr = 5 };
2989 QC.addAddressSpace(space: getLangASFromTargetAS(TargetAS: NVPTX_local_addr));
2990 ArgType = QC.apply(Context: CGM.getContext(), QT: ArgType);
2991 if (isa<ImplicitParamDecl>(Val: NativeParam))
2992 return ImplicitParamDecl::Create(
2993 CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
2994 NativeParam->getIdentifier(), ArgType, ImplicitParamKind::Other);
2995 return ParmVarDecl::Create(
2996 C&: CGM.getContext(),
2997 DC: const_cast<DeclContext *>(NativeParam->getDeclContext()),
2998 StartLoc: NativeParam->getBeginLoc(), IdLoc: NativeParam->getLocation(),
2999 Id: NativeParam->getIdentifier(), T: ArgType,
3000 /*TInfo=*/nullptr, S: SC_None, /*DefArg=*/nullptr);
3001}
3002
3003Address
3004CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF,
3005 const VarDecl *NativeParam,
3006 const VarDecl *TargetParam) const {
3007 assert(NativeParam != TargetParam &&
3008 NativeParam->getType()->isReferenceType() &&
3009 "Native arg must not be the same as target arg.");
3010 Address LocalAddr = CGF.GetAddrOfLocalVar(VD: TargetParam);
3011 QualType NativeParamType = NativeParam->getType();
3012 QualifierCollector QC;
3013 const Type *NonQualTy = QC.strip(type: NativeParamType);
3014 QualType NativePointeeTy = cast<ReferenceType>(Val: NonQualTy)->getPointeeType();
3015 unsigned NativePointeeAddrSpace =
3016 CGF.getTypes().getTargetAddressSpace(T: NativePointeeTy);
3017 QualType TargetTy = TargetParam->getType();
3018 llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(Addr: LocalAddr, /*Volatile=*/false,
3019 Ty: TargetTy, Loc: SourceLocation());
3020 // Cast to native address space.
3021 TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3022 V: TargetAddr,
3023 DestTy: llvm::PointerType::get(C&: CGF.getLLVMContext(), AddressSpace: NativePointeeAddrSpace));
3024 Address NativeParamAddr = CGF.CreateMemTemp(T: NativeParamType);
3025 CGF.EmitStoreOfScalar(Value: TargetAddr, Addr: NativeParamAddr, /*Volatile=*/false,
3026 Ty: NativeParamType);
3027 return NativeParamAddr;
3028}
3029
3030void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall(
3031 CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
3032 ArrayRef<llvm::Value *> Args) const {
3033 SmallVector<llvm::Value *, 4> TargetArgs;
3034 TargetArgs.reserve(N: Args.size());
3035 auto *FnType = OutlinedFn.getFunctionType();
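  // Cast pointer arguments to the address space the outlined function's
  // parameters expect; once the variadic tail is reached, forward the
  // remaining arguments unchanged.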
3036 for (unsigned I = 0, E = Args.size(); I < E; ++I) {
3037 if (FnType->isVarArg() && FnType->getNumParams() <= I) {
3038 TargetArgs.append(in_start: std::next(x: Args.begin(), n: I), in_end: Args.end());
3039 break;
3040 }
3041 llvm::Type *TargetType = FnType->getParamType(i: I);
3042 llvm::Value *NativeArg = Args[I];
3043 if (!TargetType->isPointerTy()) {
3044 TargetArgs.emplace_back(Args&: NativeArg);
3045 continue;
3046 }
3047 TargetArgs.emplace_back(
3048 Args: CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(V: NativeArg, DestTy: TargetType));
3049 }
3050 CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
3051}
3052
3053/// Emit function which wraps the outline parallel region
3054/// and controls the arguments which are passed to this function.
3055/// The wrapper ensures that the outlined function is called
3056/// with the correct arguments when data is shared.
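///
/// Conceptually (the names below are illustrative, not the generated ones),
/// the wrapper has the shape:
///
///   void <outlined>_wrapper(uint16_t ParallelLevel, uint32_t ThreadID) {
///     uint32_t Zero = 0;
///     void **GlobalArgs;
///     __kmpc_get_shared_variables(&GlobalArgs);
///     // Loop bounds (when shared) and captured variables are read from
///     // GlobalArgs and forwarded here.
///     <outlined>(&ThreadID, &Zero, /* values loaded from GlobalArgs */);
///   }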
3057llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
3058 llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
3059 ASTContext &Ctx = CGM.getContext();
3060 const auto &CS = *D.getCapturedStmt(OMPD_parallel);
3061
3062 // Create a function that takes as argument the source thread.
3063 FunctionArgList WrapperArgs;
3064 QualType Int16QTy =
3065 Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
3066 QualType Int32QTy =
3067 Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
3068 ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
3069 /*Id=*/nullptr, Int16QTy,
3070 ImplicitParamKind::Other);
3071 ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
3072 /*Id=*/nullptr, Int32QTy,
3073 ImplicitParamKind::Other);
3074 WrapperArgs.emplace_back(Args: &ParallelLevelArg);
3075 WrapperArgs.emplace_back(Args: &WrapperArg);
3076
3077 const CGFunctionInfo &CGFI =
3078 CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
3079
3080 auto *Fn = llvm::Function::Create(
3081 Ty: CGM.getTypes().GetFunctionType(Info: CGFI), Linkage: llvm::GlobalValue::InternalLinkage,
3082 N: Twine(OutlinedParallelFn->getName(), "_wrapper"), M: &CGM.getModule());
3083
3084 // Ensure we do not inline the function. This is trivially true for the ones
3085  // passed to __kmpc_fork_call but the ones called in serialized regions
3086  // could be inlined. This is not perfect, but it is closer to the invariant
3087 // we want, namely, every data environment starts with a new function.
3088 // TODO: We should pass the if condition to the runtime function and do the
3089 // handling there. Much cleaner code.
3090 Fn->addFnAttr(llvm::Attribute::NoInline);
3091
3092 CGM.SetInternalFunctionAttributes(GD: GlobalDecl(), F: Fn, FI: CGFI);
3093 Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
3094 Fn->setDoesNotRecurse();
3095
3096 CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
3097 CGF.StartFunction(GD: GlobalDecl(), RetTy: Ctx.VoidTy, Fn: Fn, FnInfo: CGFI, Args: WrapperArgs,
3098 Loc: D.getBeginLoc(), StartLoc: D.getBeginLoc());
3099
3100 const auto *RD = CS.getCapturedRecordDecl();
3101 auto CurField = RD->field_begin();
3102
3103 Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(Ty: CGF.Int32Ty,
3104 /*Name=*/".zero.addr");
3105 CGF.Builder.CreateStore(Val: CGF.Builder.getInt32(/*C*/ 0), Addr: ZeroAddr);
3106 // Get the array of arguments.
3107 SmallVector<llvm::Value *, 8> Args;
3108
3109 Args.emplace_back(Args: CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());
3110 Args.emplace_back(Args: ZeroAddr.getPointer());
3111
3112 CGBuilderTy &Bld = CGF.Builder;
3113 auto CI = CS.capture_begin();
3114
3115 // Use global memory for data sharing.
3116 // Handle passing of global args to workers.
3117 Address GlobalArgs =
3118 CGF.CreateDefaultAlignTempAlloca(Ty: CGF.VoidPtrPtrTy, Name: "global_args");
3119 llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
3120 llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
3121 CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
3122 M&: CGM.getModule(), FnID: OMPRTL___kmpc_get_shared_variables),
3123 args: DataSharingArgs);
3124
3125 // Retrieve the shared variables from the list of references returned
3126 // by the runtime. Pass the variables to the outlined function.
3127 Address SharedArgListAddress = Address::invalid();
3128 if (CS.capture_size() > 0 ||
3129 isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
3130 SharedArgListAddress = CGF.EmitLoadOfPointer(
3131 Ptr: GlobalArgs, PtrTy: CGF.getContext()
3132 .getPointerType(CGF.getContext().VoidPtrTy)
3133 .castAs<PointerType>());
3134 }
3135 unsigned Idx = 0;
3136 if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
3137 Address Src = Bld.CreateConstInBoundsGEP(Addr: SharedArgListAddress, Index: Idx);
3138 Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3139 Addr: Src, Ty: CGF.SizeTy->getPointerTo(), ElementTy: CGF.SizeTy);
3140 llvm::Value *LB = CGF.EmitLoadOfScalar(
3141 Addr: TypedAddress,
3142 /*Volatile=*/false,
3143 Ty: CGF.getContext().getPointerType(T: CGF.getContext().getSizeType()),
3144 Loc: cast<OMPLoopDirective>(Val: D).getLowerBoundVariable()->getExprLoc());
3145 Args.emplace_back(Args&: LB);
3146 ++Idx;
3147 Src = Bld.CreateConstInBoundsGEP(Addr: SharedArgListAddress, Index: Idx);
3148 TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3149 Addr: Src, Ty: CGF.SizeTy->getPointerTo(), ElementTy: CGF.SizeTy);
3150 llvm::Value *UB = CGF.EmitLoadOfScalar(
3151 Addr: TypedAddress,
3152 /*Volatile=*/false,
3153 Ty: CGF.getContext().getPointerType(T: CGF.getContext().getSizeType()),
3154 Loc: cast<OMPLoopDirective>(Val: D).getUpperBoundVariable()->getExprLoc());
3155 Args.emplace_back(Args&: UB);
3156 ++Idx;
3157 }
3158 if (CS.capture_size() > 0) {
3159 ASTContext &CGFContext = CGF.getContext();
3160 for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
3161 QualType ElemTy = CurField->getType();
3162 Address Src = Bld.CreateConstInBoundsGEP(Addr: SharedArgListAddress, Index: I + Idx);
3163 Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3164 Addr: Src, Ty: CGF.ConvertTypeForMem(T: CGFContext.getPointerType(T: ElemTy)),
3165 ElementTy: CGF.ConvertTypeForMem(T: ElemTy));
3166 llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
3167 /*Volatile=*/false,
3168 CGFContext.getPointerType(T: ElemTy),
3169 CI->getLocation());
3170 if (CI->capturesVariableByCopy() &&
3171 !CI->getCapturedVar()->getType()->isAnyPointerType()) {
3172 Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
3173 CI->getLocation());
3174 }
3175 Args.emplace_back(Args&: Arg);
3176 }
3177 }
3178
3179 emitOutlinedFunctionCall(CGF, Loc: D.getBeginLoc(), OutlinedFn: OutlinedParallelFn, Args);
3180 CGF.FinishFunction();
3181 return Fn;
3182}
3183
3184void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
3185 const Decl *D) {
3186 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
3187 return;
3188
3189 assert(D && "Expected function or captured|block decl.");
3190 assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
3191 "Function is registered already.");
3192 assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
3193 "Team is set but not processed.");
3194 const Stmt *Body = nullptr;
3195 bool NeedToDelayGlobalization = false;
3196 if (const auto *FD = dyn_cast<FunctionDecl>(Val: D)) {
3197 Body = FD->getBody();
3198 } else if (const auto *BD = dyn_cast<BlockDecl>(Val: D)) {
3199 Body = BD->getBody();
3200 } else if (const auto *CD = dyn_cast<CapturedDecl>(Val: D)) {
3201 Body = CD->getBody();
3202 NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
3203 if (NeedToDelayGlobalization &&
3204 getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
3205 return;
3206 }
3207 if (!Body)
3208 return;
3209 CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
3210 VarChecker.Visit(Body);
3211 const RecordDecl *GlobalizedVarsRecord =
3212 VarChecker.getGlobalizedRecord(IsInTTDRegion);
3213 TeamAndReductions.first = nullptr;
3214 TeamAndReductions.second.clear();
3215 ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
3216 VarChecker.getEscapedVariableLengthDecls();
3217 ArrayRef<const ValueDecl *> DelayedVariableLengthDecls =
3218 VarChecker.getDelayedVariableLengthDecls();
3219 if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty() &&
3220 DelayedVariableLengthDecls.empty())
3221 return;
3222 auto I = FunctionGlobalizedDecls.try_emplace(Key: CGF.CurFn).first;
3223 I->getSecond().MappedParams =
3224 std::make_unique<CodeGenFunction::OMPMapVars>();
3225 I->getSecond().EscapedParameters.insert(
3226 I: VarChecker.getEscapedParameters().begin(),
3227 E: VarChecker.getEscapedParameters().end());
3228 I->getSecond().EscapedVariableLengthDecls.append(
3229 in_start: EscapedVariableLengthDecls.begin(), in_end: EscapedVariableLengthDecls.end());
3230 I->getSecond().DelayedVariableLengthDecls.append(
3231 in_start: DelayedVariableLengthDecls.begin(), in_end: DelayedVariableLengthDecls.end());
3232 DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
3233 for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
3234 assert(VD->isCanonicalDecl() && "Expected canonical declaration");
3235 Data.insert(std::make_pair(x&: VD, y: MappedVarData()));
3236 }
3237 if (!NeedToDelayGlobalization) {
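    // Globalize the escaped variables now and push a cleanup so the
    // globalized storage is released on function exit (normal and EH paths).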
3238 emitGenericVarsProlog(CGF, Loc: D->getBeginLoc());
3239 struct GlobalizationScope final : EHScopeStack::Cleanup {
3240 GlobalizationScope() = default;
3241
3242 void Emit(CodeGenFunction &CGF, Flags flags) override {
3243 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
3244 .emitGenericVarsEpilog(CGF);
3245 }
3246 };
3247 CGF.EHStack.pushCleanup<GlobalizationScope>(Kind: NormalAndEHCleanup);
3248 }
3249}
3250
3251Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
3252 const VarDecl *VD) {
3253 if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
3254 const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
3255 auto AS = LangAS::Default;
3256 switch (A->getAllocatorType()) {
3257 // Use the default allocator here as by default local vars are
3258 // threadlocal.
3259 case OMPAllocateDeclAttr::OMPNullMemAlloc:
3260 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
3261 case OMPAllocateDeclAttr::OMPThreadMemAlloc:
3262 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
3263 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
3264 // Follow the user decision - use default allocation.
3265 return Address::invalid();
3266 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
3267      // TODO: implement support for user-defined allocators.
3268 return Address::invalid();
3269 case OMPAllocateDeclAttr::OMPConstMemAlloc:
3270 AS = LangAS::cuda_constant;
3271 break;
3272 case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
3273 AS = LangAS::cuda_shared;
3274 break;
3275 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
3276 case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
3277 break;
3278 }
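    // Emit the variable as a module-level global in the selected GPU memory
    // space and return its address cast to the address space expected for
    // the variable's declared type.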
3279 llvm::Type *VarTy = CGF.ConvertTypeForMem(T: VD->getType());
3280 auto *GV = new llvm::GlobalVariable(
3281 CGM.getModule(), VarTy, /*isConstant=*/false,
3282 llvm::GlobalValue::InternalLinkage, llvm::PoisonValue::get(T: VarTy),
3283 VD->getName(),
3284 /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
3285 CGM.getContext().getTargetAddressSpace(AS));
3286 CharUnits Align = CGM.getContext().getDeclAlign(VD);
3287 GV->setAlignment(Align.getAsAlign());
3288 return Address(
3289 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3290 GV, VarTy->getPointerTo(AddrSpace: CGM.getContext().getTargetAddressSpace(
3291 AS: VD->getType().getAddressSpace()))),
3292 VarTy, Align);
3293 }
3294
3295 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
3296 return Address::invalid();
3297
3298 VD = VD->getCanonicalDecl();
3299 auto I = FunctionGlobalizedDecls.find(Val: CGF.CurFn);
3300 if (I == FunctionGlobalizedDecls.end())
3301 return Address::invalid();
3302 auto VDI = I->getSecond().LocalVarData.find(VD);
3303 if (VDI != I->getSecond().LocalVarData.end())
3304 return VDI->second.PrivateAddr;
3305 if (VD->hasAttrs()) {
3306 for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
3307 E(VD->attr_end());
3308 IT != E; ++IT) {
3309 auto VDI = I->getSecond().LocalVarData.find(
3310 cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
3311 ->getCanonicalDecl());
3312 if (VDI != I->getSecond().LocalVarData.end())
3313 return VDI->second.PrivateAddr;
3314 }
3315 }
3316
3317 return Address::invalid();
3318}
3319
3320void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {
3321 FunctionGlobalizedDecls.erase(Val: CGF.CurFn);
3322 CGOpenMPRuntime::functionFinished(CGF);
3323}
3324
3325void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
3326 CodeGenFunction &CGF, const OMPLoopDirective &S,
3327 OpenMPDistScheduleClauseKind &ScheduleKind,
3328 llvm::Value *&Chunk) const {
3329 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
3330 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
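    // In SPMD mode, default to dist_schedule(static, <GPU thread count>) so
    // that each team gets contiguous chunks sized to its thread block.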
3331 ScheduleKind = OMPC_DIST_SCHEDULE_static;
3332 Chunk = CGF.EmitScalarConversion(
3333 Src: RT.getGPUNumThreads(CGF),
3334 SrcTy: CGF.getContext().getIntTypeForBitwidth(DestWidth: 32, /*Signed=*/0),
3335 DstTy: S.getIterationVariable()->getType(), Loc: S.getBeginLoc());
3336 return;
3337 }
3338 CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
3339 CGF, S, ScheduleKind, Chunk);
3340}
3341
3342void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk(
3343 CodeGenFunction &CGF, const OMPLoopDirective &S,
3344 OpenMPScheduleClauseKind &ScheduleKind,
3345 const Expr *&ChunkExpr) const {
3346 ScheduleKind = OMPC_SCHEDULE_static;
3347 // Chunk size is 1 in this case.
3348 llvm::APInt ChunkSize(32, 1);
3349 ChunkExpr = IntegerLiteral::Create(C: CGF.getContext(), V: ChunkSize,
3350 type: CGF.getContext().getIntTypeForBitwidth(DestWidth: 32, /*Signed=*/0),
3351 l: SourceLocation());
3352}
3353
3354void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas(
3355 CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
3356 assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
3357         "Expected target-based directive.");
3358 const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
3359 for (const CapturedStmt::Capture &C : CS->captures()) {
3360 // Capture variables captured by reference in lambdas for target-based
3361 // directives.
3362 if (!C.capturesVariable())
3363 continue;
3364 const VarDecl *VD = C.getCapturedVar();
3365 const auto *RD = VD->getType()
3366 .getCanonicalType()
3367 .getNonReferenceType()
3368 ->getAsCXXRecordDecl();
3369 if (!RD || !RD->isLambda())
3370 continue;
3371 Address VDAddr = CGF.GetAddrOfLocalVar(VD);
3372 LValue VDLVal;
3373 if (VD->getType().getCanonicalType()->isReferenceType())
3374 VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
3375 else
3376 VDLVal = CGF.MakeAddrLValue(
3377 VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
3378 llvm::DenseMap<const ValueDecl *, FieldDecl *> Captures;
3379 FieldDecl *ThisCapture = nullptr;
3380 RD->getCaptureFields(Captures, ThisCapture);
3381 if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
3382 LValue ThisLVal =
3383 CGF.EmitLValueForFieldInitialization(Base: VDLVal, Field: ThisCapture);
3384 llvm::Value *CXXThis = CGF.LoadCXXThis();
3385 CGF.EmitStoreOfScalar(value: CXXThis, lvalue: ThisLVal);
3386 }
3387 for (const LambdaCapture &LC : RD->captures()) {
3388 if (LC.getCaptureKind() != LCK_ByRef)
3389 continue;
3390 const ValueDecl *VD = LC.getCapturedVar();
3391 // FIXME: For now VD is always a VarDecl because OpenMP does not support
3392 // capturing structured bindings in lambdas yet.
3393 if (!CS->capturesVariable(cast<VarDecl>(VD)))
3394 continue;
3395 auto It = Captures.find(VD);
3396 assert(It != Captures.end() && "Found lambda capture without field.");
3397 LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
3398 Address VDAddr = CGF.GetAddrOfLocalVar(cast<VarDecl>(VD));
3399 if (VD->getType().getCanonicalType()->isReferenceType())
3400 VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
3401 VD->getType().getCanonicalType())
3402 .getAddress(CGF);
3403 CGF.EmitStoreOfScalar(VDAddr.getPointer(), VarLVal);
3404 }
3405 }
3406}
3407
3408bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
3409 LangAS &AS) {
3410 if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
3411 return false;
3412 const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
3413  switch (A->getAllocatorType()) {
3414 case OMPAllocateDeclAttr::OMPNullMemAlloc:
3415 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
3416 // Not supported, fallback to the default mem space.
3417 case OMPAllocateDeclAttr::OMPThreadMemAlloc:
3418 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
3419 case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
3420 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
3421 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
3422 AS = LangAS::Default;
3423 return true;
3424 case OMPAllocateDeclAttr::OMPConstMemAlloc:
3425 AS = LangAS::cuda_constant;
3426 return true;
3427 case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
3428 AS = LangAS::cuda_shared;
3429 return true;
3430 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
3431 llvm_unreachable("Expected predefined allocator for the variables with the "
3432 "static storage.");
3433 }
3434 return false;
3435}
3436
3437// Get the current CudaArch and ignore any unknown values.
3438static CudaArch getCudaArch(CodeGenModule &CGM) {
3439 if (!CGM.getTarget().hasFeature(Feature: "ptx"))
3440 return CudaArch::UNKNOWN;
3441 for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) {
3442 if (Feature.getValue()) {
3443 CudaArch Arch = StringToCudaArch(S: Feature.getKey());
3444 if (Arch != CudaArch::UNKNOWN)
3445 return Arch;
3446 }
3447 }
3448 return CudaArch::UNKNOWN;
3449}
3450
3451/// Check whether the target architecture supports unified addressing, which
3452/// is required by the OpenMP 'requires' clause "unified_shared_memory".
3453void CGOpenMPRuntimeGPU::processRequiresDirective(
3454 const OMPRequiresDecl *D) {
3455 for (const OMPClause *Clause : D->clauselists()) {
3456 if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
3457 CudaArch Arch = getCudaArch(CGM);
3458 switch (Arch) {
3459 case CudaArch::SM_20:
3460 case CudaArch::SM_21:
3461 case CudaArch::SM_30:
3462 case CudaArch::SM_32:
3463 case CudaArch::SM_35:
3464 case CudaArch::SM_37:
3465 case CudaArch::SM_50:
3466 case CudaArch::SM_52:
3467 case CudaArch::SM_53: {
3468 SmallString<256> Buffer;
3469 llvm::raw_svector_ostream Out(Buffer);
3470 Out << "Target architecture " << CudaArchToString(A: Arch)
3471 << " does not support unified addressing";
3472 CGM.Error(loc: Clause->getBeginLoc(), error: Out.str());
3473 return;
3474 }
3475 case CudaArch::SM_60:
3476 case CudaArch::SM_61:
3477 case CudaArch::SM_62:
3478 case CudaArch::SM_70:
3479 case CudaArch::SM_72:
3480 case CudaArch::SM_75:
3481 case CudaArch::SM_80:
3482 case CudaArch::SM_86:
3483 case CudaArch::SM_87:
3484 case CudaArch::SM_89:
3485 case CudaArch::SM_90:
3486 case CudaArch::SM_90a:
3487 case CudaArch::GFX600:
3488 case CudaArch::GFX601:
3489 case CudaArch::GFX602:
3490 case CudaArch::GFX700:
3491 case CudaArch::GFX701:
3492 case CudaArch::GFX702:
3493 case CudaArch::GFX703:
3494 case CudaArch::GFX704:
3495 case CudaArch::GFX705:
3496 case CudaArch::GFX801:
3497 case CudaArch::GFX802:
3498 case CudaArch::GFX803:
3499 case CudaArch::GFX805:
3500 case CudaArch::GFX810:
3501 case CudaArch::GFX900:
3502 case CudaArch::GFX902:
3503 case CudaArch::GFX904:
3504 case CudaArch::GFX906:
3505 case CudaArch::GFX908:
3506 case CudaArch::GFX909:
3507 case CudaArch::GFX90a:
3508 case CudaArch::GFX90c:
3509 case CudaArch::GFX940:
3510 case CudaArch::GFX941:
3511 case CudaArch::GFX942:
3512 case CudaArch::GFX1010:
3513 case CudaArch::GFX1011:
3514 case CudaArch::GFX1012:
3515 case CudaArch::GFX1013:
3516 case CudaArch::GFX1030:
3517 case CudaArch::GFX1031:
3518 case CudaArch::GFX1032:
3519 case CudaArch::GFX1033:
3520 case CudaArch::GFX1034:
3521 case CudaArch::GFX1035:
3522 case CudaArch::GFX1036:
3523 case CudaArch::GFX1100:
3524 case CudaArch::GFX1101:
3525 case CudaArch::GFX1102:
3526 case CudaArch::GFX1103:
3527 case CudaArch::GFX1150:
3528 case CudaArch::GFX1151:
3529 case CudaArch::GFX1200:
3530 case CudaArch::GFX1201:
3531 case CudaArch::Generic:
3532 case CudaArch::UNUSED:
3533 case CudaArch::UNKNOWN:
3534 break;
3535 case CudaArch::LAST:
3536 llvm_unreachable("Unexpected Cuda arch.");
3537 }
3538 }
3539 }
3540 CGOpenMPRuntime::processRequiresDirective(D);
3541}
3542
3543llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) {
3544 CGBuilderTy &Bld = CGF.Builder;
3545 llvm::Module *M = &CGF.CGM.getModule();
3546 const char *LocSize = "__kmpc_get_hardware_num_threads_in_block";
3547 llvm::Function *F = M->getFunction(Name: LocSize);
3548 if (!F) {
3549 F = llvm::Function::Create(
3550 Ty: llvm::FunctionType::get(Result: CGF.Int32Ty, Params: std::nullopt, isVarArg: false),
3551 Linkage: llvm::GlobalVariable::ExternalLinkage, N: LocSize, M: &CGF.CGM.getModule());
3552 }
3553 return Bld.CreateCall(Callee: F, Args: std::nullopt, Name: "nvptx_num_threads");
3554}
3555
3556llvm::Value *CGOpenMPRuntimeGPU::getGPUThreadID(CodeGenFunction &CGF) {
3557 ArrayRef<llvm::Value *> Args{};
3558 return CGF.EmitRuntimeCall(
3559 callee: OMPBuilder.getOrCreateRuntimeFunction(
3560 M&: CGM.getModule(), FnID: OMPRTL___kmpc_get_hardware_thread_id_in_block),
3561 args: Args);
3562}
3563
3564llvm::Value *CGOpenMPRuntimeGPU::getGPUWarpSize(CodeGenFunction &CGF) {
3565 ArrayRef<llvm::Value *> Args{};
3566 return CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
3567 M&: CGM.getModule(), FnID: OMPRTL___kmpc_get_warp_size),
3568 args: Args);
3569}
3570
