//===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a generalized class for OpenMP runtime code generation
// specialized by GPU targets NVPTX and AMDGCN.
//
//===----------------------------------------------------------------------===//

#include "CGOpenMPRuntimeGPU.h"
#include "CodeGenFunction.h"
#include "clang/AST/Attr.h"
#include "clang/AST/DeclOpenMP.h"
#include "clang/AST/OpenMPClause.h"
#include "clang/AST/StmtOpenMP.h"
#include "clang/AST/StmtVisitor.h"
#include "clang/Basic/Cuda.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/Support/MathExtras.h"

using namespace clang;
using namespace CodeGen;
using namespace llvm::omp;

namespace {
/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
class NVPTXActionTy final : public PrePostActionTy {
  llvm::FunctionCallee EnterCallee = nullptr;
  ArrayRef<llvm::Value *> EnterArgs;
  llvm::FunctionCallee ExitCallee = nullptr;
  ArrayRef<llvm::Value *> ExitArgs;
  bool Conditional = false;
  llvm::BasicBlock *ContBlock = nullptr;

public:
  NVPTXActionTy(llvm::FunctionCallee EnterCallee,
                ArrayRef<llvm::Value *> EnterArgs,
                llvm::FunctionCallee ExitCallee,
                ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
      : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
        ExitArgs(ExitArgs), Conditional(Conditional) {}
  void Enter(CodeGenFunction &CGF) override {
    llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
    if (Conditional) {
      llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
      auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
      ContBlock = CGF.createBasicBlock("omp_if.end");
      // Generate the branch (If-stmt)
      CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
      CGF.EmitBlock(ThenBlock);
    }
  }
  void Done(CodeGenFunction &CGF) {
    // Emit the rest of blocks/branches
    CGF.EmitBranch(ContBlock);
    CGF.EmitBlock(ContBlock, /*IsFinished=*/true);
  }
  void Exit(CodeGenFunction &CGF) override {
    CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
  }
};

/// A class to track the execution mode when codegening directives within
/// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
/// to the target region and used by containing directives such as 'parallel'
/// to emit optimized code.
class ExecutionRuntimeModesRAII {
private:
  CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode =
      CGOpenMPRuntimeGPU::EM_Unknown;
  CGOpenMPRuntimeGPU::ExecutionMode &ExecMode;

public:
  ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode,
                            CGOpenMPRuntimeGPU::ExecutionMode EntryMode)
      : ExecMode(ExecMode) {
    SavedExecMode = ExecMode;
    ExecMode = EntryMode;
  }
  ~ExecutionRuntimeModesRAII() { ExecMode = SavedExecMode; }
};
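
// Typical use in this file (sketch; see emitSPMDKernel/emitNonSPMDKernel):
//
//   ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_SPMD);
//   // ... emit the kernel; the previous execution mode is restored when
//   // ModeRAII goes out of scope, even on early returns.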

static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
  RefExpr = RefExpr->IgnoreParens();
  if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
    const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
    while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
      Base = TempASE->getBase()->IgnoreParenImpCasts();
    RefExpr = Base;
  } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) {
    const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
    while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
      Base = TempOASE->getBase()->IgnoreParenImpCasts();
    while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
      Base = TempASE->getBase()->IgnoreParenImpCasts();
    RefExpr = Base;
  }
  RefExpr = RefExpr->IgnoreParenImpCasts();
  if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
    return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
  const auto *ME = cast<MemberExpr>(RefExpr);
  return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
}
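
/// Build an implicit record type ("_globalized_locals_ty") that groups the
/// globalized copies of the escaped variables: decls in \p EscapedDecls get
/// an array of \p BufSize slots (when BufSize > 1), while decls in
/// \p EscapedDeclsForTeams get a single, suitably aligned field.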
static RecordDecl *buildRecordForGlobalizedVars(
    ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
    ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &MappedDeclsFields,
    int BufSize) {
  using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
  if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
    return nullptr;
  SmallVector<VarsDataTy, 4> GlobalizedVars;
  for (const ValueDecl *D : EscapedDecls)
    GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
  for (const ValueDecl *D : EscapedDeclsForTeams)
    GlobalizedVars.emplace_back(C.getDeclAlign(D), D);

  // Build struct _globalized_locals_ty {
  //   /* globalized vars */[WarpSize] align (decl_align)
  //   /* globalized vars */ for EscapedDeclsForTeams
  // };
  RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
  GlobalizedRD->startDefinition();
  llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
      EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
  for (const auto &Pair : GlobalizedVars) {
    const ValueDecl *VD = Pair.second;
    QualType Type = VD->getType();
    if (Type->isLValueReferenceType())
      Type = C.getPointerType(Type.getNonReferenceType());
    else
      Type = Type.getNonReferenceType();
    SourceLocation Loc = VD->getLocation();
    FieldDecl *Field;
    if (SingleEscaped.count(VD)) {
      Field = FieldDecl::Create(
          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      if (VD->hasAttrs()) {
        for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
             E(VD->getAttrs().end());
             I != E; ++I)
          Field->addAttr(*I);
      }
    } else {
      if (BufSize > 1) {
        llvm::APInt ArraySize(32, BufSize);
        Type = C.getConstantArrayType(Type, ArraySize, nullptr,
                                      ArraySizeModifier::Normal, 0);
      }
      Field = FieldDecl::Create(
          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      llvm::APInt Align(32, Pair.first.getQuantity());
      Field->addAttr(AlignedAttr::CreateImplicit(
          C, /*IsAlignmentExpr=*/true,
          IntegerLiteral::Create(C, Align,
                                 C.getIntTypeForBitwidth(32, /*Signed=*/0),
                                 SourceLocation()),
          {}, AlignedAttr::GNU_aligned));
    }
    GlobalizedRD->addDecl(Field);
    MappedDeclsFields.try_emplace(VD, Field);
  }
  GlobalizedRD->completeDefinition();
  return GlobalizedRD;
}

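// A local variable "escapes" its declaration context when other threads may
// need to access its storage, e.g. (illustrative snippet, not from a test):
//
//   #pragma omp target teams
//   {
//     int X = 0;
//     #pragma omp parallel
//     X += 1; // 'X' must be globalized so all threads see the same storage.
//   }
//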
182/// Get the list of variables that can escape their declaration context.
183class CheckVarsEscapingDeclContext final
184 : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
185 CodeGenFunction &CGF;
186 llvm::SetVector<const ValueDecl *> EscapedDecls;
187 llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
188 llvm::SetVector<const ValueDecl *> DelayedVariableLengthDecls;
189 llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
190 RecordDecl *GlobalizedRD = nullptr;
191 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
192 bool AllEscaped = false;
193 bool IsForCombinedParallelRegion = false;
194
195 void markAsEscaped(const ValueDecl *VD) {
196 // Do not globalize declare target variables.
197 if (!isa<VarDecl>(VD) ||
198 OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
199 return;
200 VD = cast<ValueDecl>(VD->getCanonicalDecl());
201 // Use user-specified allocation.
202 if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
203 return;
204 // Variables captured by value must be globalized.
205 bool IsCaptured = false;
206 if (auto *CSI = CGF.CapturedStmtInfo) {
207 if (const FieldDecl *FD = CSI->lookup(VD: cast<VarDecl>(Val: VD))) {
208 // Check if need to capture the variable that was already captured by
209 // value in the outer region.
210 IsCaptured = true;
211 if (!IsForCombinedParallelRegion) {
212 if (!FD->hasAttrs())
213 return;
214 const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
215 if (!Attr)
216 return;
217 if (((Attr->getCaptureKind() != OMPC_map) &&
218 !isOpenMPPrivate(Attr->getCaptureKind())) ||
219 ((Attr->getCaptureKind() == OMPC_map) &&
220 !FD->getType()->isAnyPointerType()))
221 return;
222 }
223 if (!FD->getType()->isReferenceType()) {
224 assert(!VD->getType()->isVariablyModifiedType() &&
225 "Parameter captured by value with variably modified type");
226 EscapedParameters.insert(VD);
227 } else if (!IsForCombinedParallelRegion) {
228 return;
229 }
230 }
231 }
232 if ((!CGF.CapturedStmtInfo ||
233 (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
234 VD->getType()->isReferenceType())
235 // Do not globalize variables with reference type.
236 return;
237 if (VD->getType()->isVariablyModifiedType()) {
238 // If not captured at the target region level then mark the escaped
239 // variable as delayed.
240 if (IsCaptured)
241 EscapedVariableLengthDecls.insert(X: VD);
242 else
243 DelayedVariableLengthDecls.insert(X: VD);
244 } else
245 EscapedDecls.insert(X: VD);
246 }
247
248 void VisitValueDecl(const ValueDecl *VD) {
249 if (VD->getType()->isLValueReferenceType())
250 markAsEscaped(VD);
251 if (const auto *VarD = dyn_cast<VarDecl>(Val: VD)) {
252 if (!isa<ParmVarDecl>(Val: VarD) && VarD->hasInit()) {
253 const bool SavedAllEscaped = AllEscaped;
254 AllEscaped = VD->getType()->isLValueReferenceType();
255 Visit(VarD->getInit());
256 AllEscaped = SavedAllEscaped;
257 }
258 }
259 }
260 void VisitOpenMPCapturedStmt(const CapturedStmt *S,
261 ArrayRef<OMPClause *> Clauses,
262 bool IsCombinedParallelRegion) {
263 if (!S)
264 return;
265 for (const CapturedStmt::Capture &C : S->captures()) {
266 if (C.capturesVariable() && !C.capturesVariableByCopy()) {
267 const ValueDecl *VD = C.getCapturedVar();
268 bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
269 if (IsCombinedParallelRegion) {
270 // Check if the variable is privatized in the combined construct and
271 // those private copies must be shared in the inner parallel
272 // directive.
273 IsForCombinedParallelRegion = false;
274 for (const OMPClause *C : Clauses) {
275 if (!isOpenMPPrivate(C->getClauseKind()) ||
276 C->getClauseKind() == OMPC_reduction ||
277 C->getClauseKind() == OMPC_linear ||
278 C->getClauseKind() == OMPC_private)
279 continue;
280 ArrayRef<const Expr *> Vars;
281 if (const auto *PC = dyn_cast<OMPFirstprivateClause>(Val: C))
282 Vars = PC->getVarRefs();
283 else if (const auto *PC = dyn_cast<OMPLastprivateClause>(Val: C))
284 Vars = PC->getVarRefs();
285 else
286 llvm_unreachable("Unexpected clause.");
287 for (const auto *E : Vars) {
288 const Decl *D =
289 cast<DeclRefExpr>(Val: E)->getDecl()->getCanonicalDecl();
290 if (D == VD->getCanonicalDecl()) {
291 IsForCombinedParallelRegion = true;
292 break;
293 }
294 }
295 if (IsForCombinedParallelRegion)
296 break;
297 }
298 }
299 markAsEscaped(VD);
300 if (isa<OMPCapturedExprDecl>(Val: VD))
301 VisitValueDecl(VD);
302 IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
303 }
304 }
305 }
306
307 void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
308 assert(!GlobalizedRD &&
309 "Record for globalized variables is built already.");
310 ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
311 unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
312 if (IsInTTDRegion)
313 EscapedDeclsForTeams = EscapedDecls.getArrayRef();
314 else
315 EscapedDeclsForParallel = EscapedDecls.getArrayRef();
316 GlobalizedRD = ::buildRecordForGlobalizedVars(
317 C&: CGF.getContext(), EscapedDecls: EscapedDeclsForParallel, EscapedDeclsForTeams,
318 MappedDeclsFields, BufSize: WarpSize);
319 }
320
321public:
322 CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
323 ArrayRef<const ValueDecl *> TeamsReductions)
324 : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
325 }
326 virtual ~CheckVarsEscapingDeclContext() = default;
327 void VisitDeclStmt(const DeclStmt *S) {
328 if (!S)
329 return;
330 for (const Decl *D : S->decls())
331 if (const auto *VD = dyn_cast_or_null<ValueDecl>(Val: D))
332 VisitValueDecl(VD);
333 }
334 void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
335 if (!D)
336 return;
337 if (!D->hasAssociatedStmt())
338 return;
339 if (const auto *S =
340 dyn_cast_or_null<CapturedStmt>(Val: D->getAssociatedStmt())) {
341 // Do not analyze directives that do not actually require capturing,
342 // like `omp for` or `omp simd` directives.
343 llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
344 getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
345 if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
346 VisitStmt(S: S->getCapturedStmt());
347 return;
348 }
349 VisitOpenMPCapturedStmt(
350 S, D->clauses(),
351 CaptureRegions.back() == OMPD_parallel &&
352 isOpenMPDistributeDirective(D->getDirectiveKind()));
353 }
354 }
355 void VisitCapturedStmt(const CapturedStmt *S) {
356 if (!S)
357 return;
358 for (const CapturedStmt::Capture &C : S->captures()) {
359 if (C.capturesVariable() && !C.capturesVariableByCopy()) {
360 const ValueDecl *VD = C.getCapturedVar();
361 markAsEscaped(VD);
362 if (isa<OMPCapturedExprDecl>(Val: VD))
363 VisitValueDecl(VD);
364 }
365 }
366 }
367 void VisitLambdaExpr(const LambdaExpr *E) {
368 if (!E)
369 return;
370 for (const LambdaCapture &C : E->captures()) {
371 if (C.capturesVariable()) {
372 if (C.getCaptureKind() == LCK_ByRef) {
373 const ValueDecl *VD = C.getCapturedVar();
374 markAsEscaped(VD);
375 if (E->isInitCapture(Capture: &C) || isa<OMPCapturedExprDecl>(Val: VD))
376 VisitValueDecl(VD);
377 }
378 }
379 }
380 }
381 void VisitBlockExpr(const BlockExpr *E) {
382 if (!E)
383 return;
384 for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
385 if (C.isByRef()) {
386 const VarDecl *VD = C.getVariable();
387 markAsEscaped(VD);
388 if (isa<OMPCapturedExprDecl>(Val: VD) || VD->isInitCapture())
389 VisitValueDecl(VD);
390 }
391 }
392 }
393 void VisitCallExpr(const CallExpr *E) {
394 if (!E)
395 return;
396 for (const Expr *Arg : E->arguments()) {
397 if (!Arg)
398 continue;
399 if (Arg->isLValue()) {
400 const bool SavedAllEscaped = AllEscaped;
401 AllEscaped = true;
402 Visit(Arg);
403 AllEscaped = SavedAllEscaped;
404 } else {
405 Visit(Arg);
406 }
407 }
408 Visit(E->getCallee());
409 }
410 void VisitDeclRefExpr(const DeclRefExpr *E) {
411 if (!E)
412 return;
413 const ValueDecl *VD = E->getDecl();
414 if (AllEscaped)
415 markAsEscaped(VD);
416 if (isa<OMPCapturedExprDecl>(Val: VD))
417 VisitValueDecl(VD);
418 else if (VD->isInitCapture())
419 VisitValueDecl(VD);
420 }
421 void VisitUnaryOperator(const UnaryOperator *E) {
422 if (!E)
423 return;
424 if (E->getOpcode() == UO_AddrOf) {
425 const bool SavedAllEscaped = AllEscaped;
426 AllEscaped = true;
427 Visit(E->getSubExpr());
428 AllEscaped = SavedAllEscaped;
429 } else {
430 Visit(E->getSubExpr());
431 }
432 }
433 void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
434 if (!E)
435 return;
436 if (E->getCastKind() == CK_ArrayToPointerDecay) {
437 const bool SavedAllEscaped = AllEscaped;
438 AllEscaped = true;
439 Visit(E->getSubExpr());
440 AllEscaped = SavedAllEscaped;
441 } else {
442 Visit(E->getSubExpr());
443 }
444 }
445 void VisitExpr(const Expr *E) {
446 if (!E)
447 return;
448 bool SavedAllEscaped = AllEscaped;
449 if (!E->isLValue())
450 AllEscaped = false;
451 for (const Stmt *Child : E->children())
452 if (Child)
453 Visit(Child);
454 AllEscaped = SavedAllEscaped;
455 }
456 void VisitStmt(const Stmt *S) {
457 if (!S)
458 return;
459 for (const Stmt *Child : S->children())
460 if (Child)
461 Visit(Child);
462 }
463
464 /// Returns the record that handles all the escaped local variables and used
465 /// instead of their original storage.
466 const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
467 if (!GlobalizedRD)
468 buildRecordForGlobalizedVars(IsInTTDRegion);
469 return GlobalizedRD;
470 }
471
472 /// Returns the field in the globalized record for the escaped variable.
473 const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
474 assert(GlobalizedRD &&
475 "Record for globalized variables must be generated already.");
476 return MappedDeclsFields.lookup(Val: VD);
477 }
478
479 /// Returns the list of the escaped local variables/parameters.
480 ArrayRef<const ValueDecl *> getEscapedDecls() const {
481 return EscapedDecls.getArrayRef();
482 }
483
484 /// Checks if the escaped local variable is actually a parameter passed by
485 /// value.
486 const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
487 return EscapedParameters;
488 }
489
490 /// Returns the list of the escaped variables with the variably modified
491 /// types.
492 ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
493 return EscapedVariableLengthDecls.getArrayRef();
494 }
495
496 /// Returns the list of the delayed variables with the variably modified
497 /// types.
498 ArrayRef<const ValueDecl *> getDelayedVariableLengthDecls() const {
499 return DelayedVariableLengthDecls.getArrayRef();
500 }
501};
} // anonymous namespace

/// Get the id of the warp in the block.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  unsigned LaneIDBits =
      llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
}

/// Get the id of the current lane in the Warp.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  unsigned LaneIDBits =
      llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
  assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
  unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask),
                       "nvptx_lane_id");
}
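
// Worked example with the usual warp size of 32 (LaneIDBits == 5):
// GPU thread 37 is in warp 37 >> 5 == 1, at lane 37 & 0x1f == 5.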

CGOpenMPRuntimeGPU::ExecutionMode
CGOpenMPRuntimeGPU::getExecutionMode() const {
  return CurrentExecutionMode;
}

CGOpenMPRuntimeGPU::DataSharingMode
CGOpenMPRuntimeGPU::getDataSharingMode() const {
  return CurrentDataSharingMode;
}
538
539/// Check for inner (nested) SPMD construct, if any
540static bool hasNestedSPMDDirective(ASTContext &Ctx,
541 const OMPExecutableDirective &D) {
542 const auto *CS = D.getInnermostCapturedStmt();
543 const auto *Body =
544 CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
545 const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
546
547 if (const auto *NestedDir =
548 dyn_cast_or_null<OMPExecutableDirective>(Val: ChildStmt)) {
549 OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
550 switch (D.getDirectiveKind()) {
551 case OMPD_target:
552 if (isOpenMPParallelDirective(DKind))
553 return true;
554 if (DKind == OMPD_teams) {
555 Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
556 /*IgnoreCaptured=*/true);
557 if (!Body)
558 return false;
559 ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
560 if (const auto *NND =
561 dyn_cast_or_null<OMPExecutableDirective>(Val: ChildStmt)) {
562 DKind = NND->getDirectiveKind();
563 if (isOpenMPParallelDirective(DKind))
564 return true;
565 }
566 }
567 return false;
568 case OMPD_target_teams:
569 return isOpenMPParallelDirective(DKind);
570 case OMPD_target_simd:
571 case OMPD_target_parallel:
572 case OMPD_target_parallel_for:
573 case OMPD_target_parallel_for_simd:
574 case OMPD_target_teams_distribute:
575 case OMPD_target_teams_distribute_simd:
576 case OMPD_target_teams_distribute_parallel_for:
577 case OMPD_target_teams_distribute_parallel_for_simd:
578 case OMPD_parallel:
579 case OMPD_for:
580 case OMPD_parallel_for:
581 case OMPD_parallel_master:
582 case OMPD_parallel_sections:
583 case OMPD_for_simd:
584 case OMPD_parallel_for_simd:
585 case OMPD_cancel:
586 case OMPD_cancellation_point:
587 case OMPD_ordered:
588 case OMPD_threadprivate:
589 case OMPD_allocate:
590 case OMPD_task:
591 case OMPD_simd:
592 case OMPD_sections:
593 case OMPD_section:
594 case OMPD_single:
595 case OMPD_master:
596 case OMPD_critical:
597 case OMPD_taskyield:
598 case OMPD_barrier:
599 case OMPD_taskwait:
600 case OMPD_taskgroup:
601 case OMPD_atomic:
602 case OMPD_flush:
603 case OMPD_depobj:
604 case OMPD_scan:
605 case OMPD_teams:
606 case OMPD_target_data:
607 case OMPD_target_exit_data:
608 case OMPD_target_enter_data:
609 case OMPD_distribute:
610 case OMPD_distribute_simd:
611 case OMPD_distribute_parallel_for:
612 case OMPD_distribute_parallel_for_simd:
613 case OMPD_teams_distribute:
614 case OMPD_teams_distribute_simd:
615 case OMPD_teams_distribute_parallel_for:
616 case OMPD_teams_distribute_parallel_for_simd:
617 case OMPD_target_update:
618 case OMPD_declare_simd:
619 case OMPD_declare_variant:
620 case OMPD_begin_declare_variant:
621 case OMPD_end_declare_variant:
622 case OMPD_declare_target:
623 case OMPD_end_declare_target:
624 case OMPD_declare_reduction:
625 case OMPD_declare_mapper:
626 case OMPD_taskloop:
627 case OMPD_taskloop_simd:
628 case OMPD_master_taskloop:
629 case OMPD_master_taskloop_simd:
630 case OMPD_parallel_master_taskloop:
631 case OMPD_parallel_master_taskloop_simd:
632 case OMPD_requires:
633 case OMPD_unknown:
634 default:
635 llvm_unreachable("Unexpected directive.");
636 }
637 }
638
639 return false;
640}
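
// Example (illustrative): the region below is reported as having a nested
// SPMD directive, since the innermost statement of the 'target' region is a
// parallel worksharing construct:
//
//   #pragma omp target
//   #pragma omp parallel for
//   for (int I = 0; I < N; ++I) ...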
641
642static bool supportsSPMDExecutionMode(ASTContext &Ctx,
643 const OMPExecutableDirective &D) {
644 OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
645 switch (DirectiveKind) {
646 case OMPD_target:
647 case OMPD_target_teams:
648 return hasNestedSPMDDirective(Ctx, D);
649 case OMPD_target_teams_loop:
650 case OMPD_target_parallel_loop:
651 case OMPD_target_parallel:
652 case OMPD_target_parallel_for:
653 case OMPD_target_parallel_for_simd:
654 case OMPD_target_teams_distribute_parallel_for:
655 case OMPD_target_teams_distribute_parallel_for_simd:
656 case OMPD_target_simd:
657 case OMPD_target_teams_distribute_simd:
658 return true;
659 case OMPD_target_teams_distribute:
660 return false;
661 case OMPD_parallel:
662 case OMPD_for:
663 case OMPD_parallel_for:
664 case OMPD_parallel_master:
665 case OMPD_parallel_sections:
666 case OMPD_for_simd:
667 case OMPD_parallel_for_simd:
668 case OMPD_cancel:
669 case OMPD_cancellation_point:
670 case OMPD_ordered:
671 case OMPD_threadprivate:
672 case OMPD_allocate:
673 case OMPD_task:
674 case OMPD_simd:
675 case OMPD_sections:
676 case OMPD_section:
677 case OMPD_single:
678 case OMPD_master:
679 case OMPD_critical:
680 case OMPD_taskyield:
681 case OMPD_barrier:
682 case OMPD_taskwait:
683 case OMPD_taskgroup:
684 case OMPD_atomic:
685 case OMPD_flush:
686 case OMPD_depobj:
687 case OMPD_scan:
688 case OMPD_teams:
689 case OMPD_target_data:
690 case OMPD_target_exit_data:
691 case OMPD_target_enter_data:
692 case OMPD_distribute:
693 case OMPD_distribute_simd:
694 case OMPD_distribute_parallel_for:
695 case OMPD_distribute_parallel_for_simd:
696 case OMPD_teams_distribute:
697 case OMPD_teams_distribute_simd:
698 case OMPD_teams_distribute_parallel_for:
699 case OMPD_teams_distribute_parallel_for_simd:
700 case OMPD_target_update:
701 case OMPD_declare_simd:
702 case OMPD_declare_variant:
703 case OMPD_begin_declare_variant:
704 case OMPD_end_declare_variant:
705 case OMPD_declare_target:
706 case OMPD_end_declare_target:
707 case OMPD_declare_reduction:
708 case OMPD_declare_mapper:
709 case OMPD_taskloop:
710 case OMPD_taskloop_simd:
711 case OMPD_master_taskloop:
712 case OMPD_master_taskloop_simd:
713 case OMPD_parallel_master_taskloop:
714 case OMPD_parallel_master_taskloop_simd:
715 case OMPD_requires:
716 case OMPD_unknown:
717 default:
718 break;
719 }
720 llvm_unreachable(
721 "Unknown programming model for OpenMP directive on NVPTX target.");
722}
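
// In practice (per the switch above): combined constructs such as
// 'target parallel for' or 'target teams distribute parallel for' run in
// SPMD mode, whereas a plain 'target teams distribute' with no nested
// parallel region falls back to the generic (non-SPMD) kernel.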

void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
                                           StringRef ParentName,
                                           llvm::Function *&OutlinedFn,
                                           llvm::Constant *&OutlinedFnID,
                                           bool IsOffloadEntry,
                                           const RegionCodeGenTy &CodeGen) {
  ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_NonSPMD);
  EntryFunctionState EST;
  WrapperFunctionsMap.clear();

  [[maybe_unused]] bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
  assert(!IsBareKernel && "bare kernel should not be at generic mode");

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeGPU::EntryFunctionState &EST;
    const OMPExecutableDirective &D;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST,
                         const OMPExecutableDirective &D)
        : EST(EST), D(D) {}
    void Enter(CodeGenFunction &CGF) override {
      auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
      RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ false);
      // Skip target region initialization.
      RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
    }
    void Exit(CodeGenFunction &CGF) override {
      auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
      RT.clearLocThreadIdInsertPt(CGF);
      RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false);
    }
  } Action(EST, D);
  CodeGen.setAction(Action);
  IsInTTDRegion = true;
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
  IsInTTDRegion = false;
}

void CGOpenMPRuntimeGPU::emitKernelInit(const OMPExecutableDirective &D,
                                        CodeGenFunction &CGF,
                                        EntryFunctionState &EST, bool IsSPMD) {
  int32_t MinThreadsVal = 1, MaxThreadsVal = -1, MinTeamsVal = 1,
          MaxTeamsVal = -1;
  computeMinAndMaxThreadsAndTeams(D, CGF, MinThreadsVal, MaxThreadsVal,
                                  MinTeamsVal, MaxTeamsVal);

  CGBuilderTy &Bld = CGF.Builder;
  Bld.restoreIP(OMPBuilder.createTargetInit(
      Bld, IsSPMD, MinThreadsVal, MaxThreadsVal, MinTeamsVal, MaxTeamsVal));
  if (!IsSPMD)
    emitGenericVarsProlog(CGF, EST.Loc);
}
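
// Note: createTargetInit above is what materializes the __kmpc_target_init
// call in the kernel prologue; for generic (non-SPMD) kernels it is also
// expected to set up the worker-thread state machine while the main thread
// continues with the user code (see OpenMPIRBuilder for the exact sequence).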

void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
                                          EntryFunctionState &EST,
                                          bool IsSPMD) {
  if (!IsSPMD)
    emitGenericVarsEpilog(CGF);

  // This is temporary until we remove the fixed sized buffer.
  ASTContext &C = CGM.getContext();
  RecordDecl *StaticRD = C.buildImplicitRecord(
      "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::Union);
  StaticRD->startDefinition();
  for (const RecordDecl *TeamReductionRec : TeamsReductions) {
    QualType RecTy = C.getRecordType(TeamReductionRec);
    auto *Field = FieldDecl::Create(
        C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
        C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
        /*BW=*/nullptr, /*Mutable=*/false,
        /*InitStyle=*/ICIS_NoInit);
    Field->setAccess(AS_public);
    StaticRD->addDecl(Field);
  }
  StaticRD->completeDefinition();
  QualType StaticTy = C.getRecordType(StaticRD);
  llvm::Type *LLVMReductionsBufferTy =
      CGM.getTypes().ConvertTypeForMem(StaticTy);
  const auto &DL = CGM.getModule().getDataLayout();
  uint64_t ReductionDataSize =
      TeamsReductions.empty()
          ? 0
          : DL.getTypeAllocSize(LLVMReductionsBufferTy).getFixedValue();
  CGBuilderTy &Bld = CGF.Builder;
  OMPBuilder.createTargetDeinit(Bld, ReductionDataSize,
                                C.getLangOpts().OpenMPCUDAReductionBufNum);
  TeamsReductions.clear();
}
815
816void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
817 StringRef ParentName,
818 llvm::Function *&OutlinedFn,
819 llvm::Constant *&OutlinedFnID,
820 bool IsOffloadEntry,
821 const RegionCodeGenTy &CodeGen) {
822 ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_SPMD);
823 EntryFunctionState EST;
824
825 bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
826
827 // Emit target region as a standalone region.
828 class NVPTXPrePostActionTy : public PrePostActionTy {
829 CGOpenMPRuntimeGPU &RT;
830 CGOpenMPRuntimeGPU::EntryFunctionState &EST;
831 bool IsBareKernel;
832 DataSharingMode Mode;
833 const OMPExecutableDirective &D;
834
835 public:
836 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
837 CGOpenMPRuntimeGPU::EntryFunctionState &EST,
838 bool IsBareKernel, const OMPExecutableDirective &D)
839 : RT(RT), EST(EST), IsBareKernel(IsBareKernel),
840 Mode(RT.CurrentDataSharingMode), D(D) {}
841 void Enter(CodeGenFunction &CGF) override {
842 if (IsBareKernel) {
843 RT.CurrentDataSharingMode = DataSharingMode::DS_CUDA;
844 return;
845 }
846 RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ true);
847 // Skip target region initialization.
848 RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
849 }
850 void Exit(CodeGenFunction &CGF) override {
851 if (IsBareKernel) {
852 RT.CurrentDataSharingMode = Mode;
853 return;
854 }
855 RT.clearLocThreadIdInsertPt(CGF);
856 RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true);
857 }
858 } Action(*this, EST, IsBareKernel, D);
859 CodeGen.setAction(Action);
860 IsInTTDRegion = true;
861 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
862 IsOffloadEntry, CodeGen);
863 IsInTTDRegion = false;
864}
865
866void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
867 const OMPExecutableDirective &D, StringRef ParentName,
868 llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
869 bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
870 if (!IsOffloadEntry) // Nothing to do.
871 return;
872
873 assert(!ParentName.empty() && "Invalid target region parent name!");
874
875 bool Mode = supportsSPMDExecutionMode(Ctx&: CGM.getContext(), D);
876 bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
877 if (Mode || IsBareKernel)
878 emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
879 CodeGen);
880 else
881 emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
882 CodeGen);
883}
884
885CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
886 : CGOpenMPRuntime(CGM) {
887 llvm::OpenMPIRBuilderConfig Config(
888 CGM.getLangOpts().OpenMPIsTargetDevice, isGPU(),
889 CGM.getLangOpts().OpenMPOffloadMandatory,
890 /*HasRequiresReverseOffload*/ false, /*HasRequiresUnifiedAddress*/ false,
891 hasRequiresUnifiedSharedMemory(), /*HasRequiresDynamicAllocators*/ false);
892 OMPBuilder.setConfig(Config);
893
894 if (!CGM.getLangOpts().OpenMPIsTargetDevice)
895 llvm_unreachable("OpenMP can only handle device code.");
896
897 if (CGM.getLangOpts().OpenMPCUDAMode)
898 CurrentDataSharingMode = CGOpenMPRuntimeGPU::DS_CUDA;
899
900 llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder();
901 if (CGM.getLangOpts().NoGPULib || CGM.getLangOpts().OMPHostIRFile.empty())
902 return;
903
904 OMPBuilder.createGlobalFlag(Value: CGM.getLangOpts().OpenMPTargetDebug,
905 Name: "__omp_rtl_debug_kind");
906 OMPBuilder.createGlobalFlag(Value: CGM.getLangOpts().OpenMPTeamSubscription,
907 Name: "__omp_rtl_assume_teams_oversubscription");
908 OMPBuilder.createGlobalFlag(Value: CGM.getLangOpts().OpenMPThreadSubscription,
909 Name: "__omp_rtl_assume_threads_oversubscription");
910 OMPBuilder.createGlobalFlag(Value: CGM.getLangOpts().OpenMPNoThreadState,
911 Name: "__omp_rtl_assume_no_thread_state");
912 OMPBuilder.createGlobalFlag(Value: CGM.getLangOpts().OpenMPNoNestedParallelism,
913 Name: "__omp_rtl_assume_no_nested_parallelism");
914}
915
916void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF,
917 ProcBindKind ProcBind,
918 SourceLocation Loc) {
919 // Nothing to do.
920}
921
922void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF,
923 llvm::Value *NumThreads,
924 SourceLocation Loc) {
925 // Nothing to do.
926}
927
928void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF,
929 const Expr *NumTeams,
930 const Expr *ThreadLimit,
931 SourceLocation Loc) {}
932
933llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction(
934 CodeGenFunction &CGF, const OMPExecutableDirective &D,
935 const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
936 const RegionCodeGenTy &CodeGen) {
937 // Emit target region as a standalone region.
938 bool PrevIsInTTDRegion = IsInTTDRegion;
939 IsInTTDRegion = false;
940 auto *OutlinedFun =
941 cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
942 CGF, D, ThreadIDVar, InnermostKind, CodeGen));
943 IsInTTDRegion = PrevIsInTTDRegion;
944 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) {
945 llvm::Function *WrapperFun =
946 createParallelDataSharingWrapper(OutlinedParallelFn: OutlinedFun, D);
947 WrapperFunctionsMap[OutlinedFun] = WrapperFun;
948 }
949
950 return OutlinedFun;
951}
952
953/// Get list of lastprivate variables from the teams distribute ... or
954/// teams {distribute ...} directives.
955static void
956getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
957 llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
958 assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
959 "expected teams directive.");
960 const OMPExecutableDirective *Dir = &D;
961 if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
962 if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(
963 Ctx,
964 Body: D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
965 /*IgnoreCaptured=*/true))) {
966 Dir = dyn_cast_or_null<OMPExecutableDirective>(Val: S);
967 if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
968 Dir = nullptr;
969 }
970 }
971 if (!Dir)
972 return;
973 for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
974 for (const Expr *E : C->getVarRefs())
975 Vars.push_back(getPrivateItem(E));
976 }
977}
978
979/// Get list of reduction variables from the teams ... directives.
980static void
981getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
982 llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
983 assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
984 "expected teams directive.");
985 for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
986 for (const Expr *E : C->privates())
987 Vars.push_back(Elt: getPrivateItem(RefExpr: E));
988 }
989}
990
991llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
992 CodeGenFunction &CGF, const OMPExecutableDirective &D,
993 const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
994 const RegionCodeGenTy &CodeGen) {
995 SourceLocation Loc = D.getBeginLoc();
996
997 const RecordDecl *GlobalizedRD = nullptr;
998 llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
999 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
1000 unsigned WarpSize = CGM.getTarget().getGridValue().GV_Warp_Size;
1001 // Globalize team reductions variable unconditionally in all modes.
1002 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
1003 getTeamsReductionVars(Ctx&: CGM.getContext(), D, Vars&: LastPrivatesReductions);
1004 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
1005 getDistributeLastprivateVars(Ctx&: CGM.getContext(), D, Vars&: LastPrivatesReductions);
1006 if (!LastPrivatesReductions.empty()) {
1007 GlobalizedRD = ::buildRecordForGlobalizedVars(
1008 C&: CGM.getContext(), EscapedDecls: std::nullopt, EscapedDeclsForTeams: LastPrivatesReductions,
1009 MappedDeclsFields, BufSize: WarpSize);
1010 }
1011 } else if (!LastPrivatesReductions.empty()) {
1012 assert(!TeamAndReductions.first &&
1013 "Previous team declaration is not expected.");
1014 TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
1015 std::swap(LHS&: TeamAndReductions.second, RHS&: LastPrivatesReductions);
1016 }
1017
1018 // Emit target region as a standalone region.
1019 class NVPTXPrePostActionTy : public PrePostActionTy {
1020 SourceLocation &Loc;
1021 const RecordDecl *GlobalizedRD;
1022 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1023 &MappedDeclsFields;
1024
1025 public:
1026 NVPTXPrePostActionTy(
1027 SourceLocation &Loc, const RecordDecl *GlobalizedRD,
1028 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1029 &MappedDeclsFields)
1030 : Loc(Loc), GlobalizedRD(GlobalizedRD),
1031 MappedDeclsFields(MappedDeclsFields) {}
1032 void Enter(CodeGenFunction &CGF) override {
1033 auto &Rt =
1034 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1035 if (GlobalizedRD) {
1036 auto I = Rt.FunctionGlobalizedDecls.try_emplace(Key: CGF.CurFn).first;
1037 I->getSecond().MappedParams =
1038 std::make_unique<CodeGenFunction::OMPMapVars>();
1039 DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
1040 for (const auto &Pair : MappedDeclsFields) {
1041 assert(Pair.getFirst()->isCanonicalDecl() &&
1042 "Expected canonical declaration");
1043 Data.insert(std::make_pair(x: Pair.getFirst(), y: MappedVarData()));
1044 }
1045 }
1046 Rt.emitGenericVarsProlog(CGF, Loc);
1047 }
1048 void Exit(CodeGenFunction &CGF) override {
1049 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
1050 .emitGenericVarsEpilog(CGF);
1051 }
1052 } Action(Loc, GlobalizedRD, MappedDeclsFields);
1053 CodeGen.setAction(Action);
1054 llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
1055 CGF, D, ThreadIDVar, InnermostKind, CodeGen);
1056
1057 return OutlinedFun;
1058}

void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
                                               SourceLocation Loc) {
  if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
    return;

  CGBuilderTy &Bld = CGF.Builder;

  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I == FunctionGlobalizedDecls.end())
    return;

  for (auto &Rec : I->getSecond().LocalVarData) {
    const auto *VD = cast<VarDecl>(Rec.first);
    bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
    QualType VarTy = VD->getType();

    // Get the local allocation of a firstprivate variable before sharing.
    llvm::Value *ParValue;
    if (EscapedParam) {
      LValue ParLVal =
          CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
      ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
    }

    // Allocate space for the variable to be globalized.
    llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
    llvm::CallBase *VoidPtr =
        CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                                CGM.getModule(), OMPRTL___kmpc_alloc_shared),
                            AllocArgs, VD->getName());
    // FIXME: We should use the variable's actual alignment as an argument.
    VoidPtr->addRetAttr(llvm::Attribute::get(
        CGM.getLLVMContext(), llvm::Attribute::Alignment,
        CGM.getContext().getTargetInfo().getNewAlign() / 8));

    // Cast the void pointer and get the address of the globalized variable.
    llvm::PointerType *VarPtrTy = CGF.ConvertTypeForMem(VarTy)->getPointerTo();
    llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
        VoidPtr, VarPtrTy, VD->getName() + "_on_stack");
    LValue VarAddr = CGF.MakeNaturalAlignAddrLValue(CastedVoidPtr, VarTy);
    Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
    Rec.second.GlobalizedVal = VoidPtr;

    // Assign the local allocation to the newly globalized location.
    if (EscapedParam) {
      CGF.EmitStoreOfScalar(ParValue, VarAddr);
      I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress(CGF));
    }
    if (auto *DI = CGF.getDebugInfo())
      VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(VD->getLocation()));
  }

  for (const auto *ValueD : I->getSecond().EscapedVariableLengthDecls) {
    const auto *VD = cast<VarDecl>(ValueD);
    std::pair<llvm::Value *, llvm::Value *> AddrSizePair =
        getKmpcAllocShared(CGF, VD);
    I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(AddrSizePair);
    LValue Base = CGF.MakeAddrLValue(AddrSizePair.first, VD->getType(),
                                     CGM.getContext().getDeclAlign(VD),
                                     AlignmentSource::Decl);
    I->getSecond().MappedParams->setVarAddr(CGF, VD, Base.getAddress(CGF));
  }
  I->getSecond().MappedParams->apply(CGF);
}
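
// Roughly, for a 4-byte globalized local 'X' the prolog above emits
//   %X = call align 8 ptr @__kmpc_alloc_shared(i64 4)
// (the alignment value is target dependent), and emitGenericVarsEpilog later
// pairs it with
//   call void @__kmpc_free_shared(ptr %X, i64 4)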

bool CGOpenMPRuntimeGPU::isDelayedVariableLengthDecl(CodeGenFunction &CGF,
                                                     const VarDecl *VD) const {
  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I == FunctionGlobalizedDecls.end())
    return false;

  // Check if the variable declaration is delayed:
  return llvm::is_contained(I->getSecond().DelayedVariableLengthDecls, VD);
}

std::pair<llvm::Value *, llvm::Value *>
CGOpenMPRuntimeGPU::getKmpcAllocShared(CodeGenFunction &CGF,
                                       const VarDecl *VD) {
  CGBuilderTy &Bld = CGF.Builder;

  // Compute size and alignment.
  llvm::Value *Size = CGF.getTypeSize(VD->getType());
  CharUnits Align = CGM.getContext().getDeclAlign(VD);
  Size = Bld.CreateNUWAdd(
      Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
  llvm::Value *AlignVal =
      llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
  Size = Bld.CreateUDiv(Size, AlignVal);
  Size = Bld.CreateNUWMul(Size, AlignVal);

  // Allocate space for this VLA object to be globalized.
  llvm::Value *AllocArgs[] = {Size};
  llvm::CallBase *VoidPtr =
      CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                              CGM.getModule(), OMPRTL___kmpc_alloc_shared),
                          AllocArgs, VD->getName());
  VoidPtr->addRetAttr(llvm::Attribute::get(
      CGM.getLLVMContext(), llvm::Attribute::Alignment, Align.getQuantity()));

  return std::make_pair(VoidPtr, Size);
}
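
// Size round-up example: a 10-byte VLA with 8-byte alignment is padded to
// ((10 + 7) / 8) * 8 == 16 bytes before calling __kmpc_alloc_shared.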
1161
1162void CGOpenMPRuntimeGPU::getKmpcFreeShared(
1163 CodeGenFunction &CGF,
1164 const std::pair<llvm::Value *, llvm::Value *> &AddrSizePair) {
1165 // Deallocate the memory for each globalized VLA object
1166 CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1167 M&: CGM.getModule(), FnID: OMPRTL___kmpc_free_shared),
1168 args: {AddrSizePair.first, AddrSizePair.second});
1169}
1170
1171void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF) {
1172 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
1173 return;
1174
1175 const auto I = FunctionGlobalizedDecls.find(Val: CGF.CurFn);
1176 if (I != FunctionGlobalizedDecls.end()) {
1177 // Deallocate the memory for each globalized VLA object that was
1178 // globalized in the prolog (i.e. emitGenericVarsProlog).
1179 for (const auto &AddrSizePair :
1180 llvm::reverse(C&: I->getSecond().EscapedVariableLengthDeclsAddrs)) {
1181 CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1182 M&: CGM.getModule(), FnID: OMPRTL___kmpc_free_shared),
1183 args: {AddrSizePair.first, AddrSizePair.second});
1184 }
1185 // Deallocate the memory for each globalized value
1186 for (auto &Rec : llvm::reverse(C&: I->getSecond().LocalVarData)) {
1187 const auto *VD = cast<VarDecl>(Val: Rec.first);
1188 I->getSecond().MappedParams->restore(CGF);
1189
1190 llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
1191 CGF.getTypeSize(Ty: VD->getType())};
1192 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1193 M&: CGM.getModule(), FnID: OMPRTL___kmpc_free_shared),
1194 FreeArgs);
1195 }
1196 }
1197}
1198
1199void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,
1200 const OMPExecutableDirective &D,
1201 SourceLocation Loc,
1202 llvm::Function *OutlinedFn,
1203 ArrayRef<llvm::Value *> CapturedVars) {
1204 if (!CGF.HaveInsertPoint())
1205 return;
1206
1207 bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
1208
1209 Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(Ty: CGF.Int32Ty,
1210 /*Name=*/".zero.addr");
1211 CGF.Builder.CreateStore(Val: CGF.Builder.getInt32(/*C*/ 0), Addr: ZeroAddr);
1212 llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
1213 // We don't emit any thread id function call in bare kernel, but because the
1214 // outlined function has a pointer argument, we emit a nullptr here.
1215 if (IsBareKernel)
1216 OutlinedFnArgs.push_back(Elt: llvm::ConstantPointerNull::get(T: CGM.VoidPtrTy));
1217 else
1218 OutlinedFnArgs.push_back(Elt: emitThreadIDAddress(CGF, Loc).getPointer());
1219 OutlinedFnArgs.push_back(Elt: ZeroAddr.getPointer());
1220 OutlinedFnArgs.append(in_start: CapturedVars.begin(), in_end: CapturedVars.end());
1221 emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, Args: OutlinedFnArgs);
1222}

void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
                                          SourceLocation Loc,
                                          llvm::Function *OutlinedFn,
                                          ArrayRef<llvm::Value *> CapturedVars,
                                          const Expr *IfCond,
                                          llvm::Value *NumThreads) {
  if (!CGF.HaveInsertPoint())
    return;

  auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, IfCond,
                        NumThreads](CodeGenFunction &CGF,
                                    PrePostActionTy &Action) {
    CGBuilderTy &Bld = CGF.Builder;
    llvm::Value *NumThreadsVal = NumThreads;
    llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
    llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy);
    if (WFn)
      ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
    llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy);

    // Create a private scope that will globalize the arguments
    // passed from the outside of the target region.
    // TODO: Is that needed?
    CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);

    Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
        llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()),
        "captured_vars_addrs");
    // There's something to share.
    if (!CapturedVars.empty()) {
      // Prepare for parallel region. Indicate the outlined function.
      ASTContext &Ctx = CGF.getContext();
      unsigned Idx = 0;
      for (llvm::Value *V : CapturedVars) {
        Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx);
        llvm::Value *PtrV;
        if (V->getType()->isIntegerTy())
          PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
        else
          PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
        CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
                              Ctx.getPointerType(Ctx.VoidPtrTy));
        ++Idx;
      }
    }

    llvm::Value *IfCondVal = nullptr;
    if (IfCond)
      IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty,
                                    /* isSigned */ false);
    else
      IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);

    if (!NumThreadsVal)
      NumThreadsVal = llvm::ConstantInt::get(CGF.Int32Ty, -1);
    else
      NumThreadsVal = Bld.CreateZExtOrTrunc(NumThreadsVal, CGF.Int32Ty);

    assert(IfCondVal && "Expected a value");
    llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
    llvm::Value *Args[] = {
        RTLoc,
        getThreadID(CGF, Loc),
        IfCondVal,
        NumThreadsVal,
        llvm::ConstantInt::get(CGF.Int32Ty, -1),
        FnPtr,
        ID,
        Bld.CreateBitOrPointerCast(CapturedVarsAddrs.getPointer(),
                                   CGF.VoidPtrPtrTy),
        llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
    CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
                            CGM.getModule(), OMPRTL___kmpc_parallel_51),
                        Args);
  };

  RegionCodeGenTy RCG(ParallelGen);
  RCG(CGF);
}
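
// The argument list above lines up with the device runtime entry point
//   __kmpc_parallel_51(loc, global_tid, if_expr, num_threads, proc_bind,
//                      fn, wrapper_fn, args, nargs)
// where proc_bind is hard-coded to -1 because proc_bind is not handled on
// the GPU (see emitProcBindClause above).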
1303
1304void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
1305 // Always emit simple barriers!
1306 if (!CGF.HaveInsertPoint())
1307 return;
1308 // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
1309 // This function does not use parameters, so we can emit just default values.
1310 llvm::Value *Args[] = {
1311 llvm::ConstantPointerNull::get(
1312 T: cast<llvm::PointerType>(getIdentTyPointerTy())),
1313 llvm::ConstantInt::get(Ty: CGF.Int32Ty, /*V=*/0, /*isSigned=*/IsSigned: true)};
1314 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1315 M&: CGM.getModule(), FnID: OMPRTL___kmpc_barrier_simple_spmd),
1316 Args);
1317}
1318
1319void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF,
1320 SourceLocation Loc,
1321 OpenMPDirectiveKind Kind, bool,
1322 bool) {
1323 // Always emit simple barriers!
1324 if (!CGF.HaveInsertPoint())
1325 return;
1326 // Build call __kmpc_cancel_barrier(loc, thread_id);
1327 unsigned Flags = getDefaultFlagsForBarriers(Kind);
1328 llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
1329 getThreadID(CGF, Loc)};
1330
1331 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1332 M&: CGM.getModule(), FnID: OMPRTL___kmpc_barrier),
1333 Args);
1334}
1335
1336void CGOpenMPRuntimeGPU::emitCriticalRegion(
1337 CodeGenFunction &CGF, StringRef CriticalName,
1338 const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
1339 const Expr *Hint) {
1340 llvm::BasicBlock *LoopBB = CGF.createBasicBlock(name: "omp.critical.loop");
1341 llvm::BasicBlock *TestBB = CGF.createBasicBlock(name: "omp.critical.test");
1342 llvm::BasicBlock *SyncBB = CGF.createBasicBlock(name: "omp.critical.sync");
1343 llvm::BasicBlock *BodyBB = CGF.createBasicBlock(name: "omp.critical.body");
1344 llvm::BasicBlock *ExitBB = CGF.createBasicBlock(name: "omp.critical.exit");
1345
1346 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1347
1348 // Get the mask of active threads in the warp.
1349 llvm::Value *Mask = CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1350 M&: CGM.getModule(), FnID: OMPRTL___kmpc_warp_active_thread_mask));
1351 // Fetch team-local id of the thread.
1352 llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
1353
1354 // Get the width of the team.
1355 llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);
1356
1357 // Initialize the counter variable for the loop.
1358 QualType Int32Ty =
1359 CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
1360 Address Counter = CGF.CreateMemTemp(T: Int32Ty, Name: "critical_counter");
1361 LValue CounterLVal = CGF.MakeAddrLValue(Addr: Counter, T: Int32Ty);
1362 CGF.EmitStoreOfScalar(value: llvm::Constant::getNullValue(Ty: CGM.Int32Ty), lvalue: CounterLVal,
1363 /*isInit=*/true);
1364
1365 // Block checks if loop counter exceeds upper bound.
1366 CGF.EmitBlock(BB: LoopBB);
1367 llvm::Value *CounterVal = CGF.EmitLoadOfScalar(lvalue: CounterLVal, Loc);
1368 llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(LHS: CounterVal, RHS: TeamWidth);
1369 CGF.Builder.CreateCondBr(Cond: CmpLoopBound, True: TestBB, False: ExitBB);
1370
1371 // Block tests which single thread should execute region, and which threads
1372 // should go straight to synchronisation point.
1373 CGF.EmitBlock(BB: TestBB);
1374 CounterVal = CGF.EmitLoadOfScalar(lvalue: CounterLVal, Loc);
1375 llvm::Value *CmpThreadToCounter =
1376 CGF.Builder.CreateICmpEQ(LHS: ThreadID, RHS: CounterVal);
1377 CGF.Builder.CreateCondBr(Cond: CmpThreadToCounter, True: BodyBB, False: SyncBB);
1378
1379 // Block emits the body of the critical region.
1380 CGF.EmitBlock(BB: BodyBB);
1381
1382 // Output the critical statement.
1383 CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
1384 Hint);
1385
1386 // After the body surrounded by the critical region, the single executing
1387 // thread will jump to the synchronisation point.
1388 // Block waits for all threads in current team to finish then increments the
1389 // counter variable and returns to the loop.
1390 CGF.EmitBlock(BB: SyncBB);
1391 // Reconverge active threads in the warp.
1392 (void)CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
1393 M&: CGM.getModule(), FnID: OMPRTL___kmpc_syncwarp),
1394 args: Mask);
1395
1396 llvm::Value *IncCounterVal =
1397 CGF.Builder.CreateNSWAdd(LHS: CounterVal, RHS: CGF.Builder.getInt32(C: 1));
1398 CGF.EmitStoreOfScalar(value: IncCounterVal, lvalue: CounterLVal);
1399 CGF.EmitBranch(Block: LoopBB);
1400
1401 // Block that is reached when all threads in the team complete the region.
1402 CGF.EmitBlock(BB: ExitBB, /*IsFinished=*/true);
1403}
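
// In pseudocode, the blocks above serialize the critical region across the
// team roughly as:
//   for (Counter = 0; Counter < TeamWidth; ++Counter) {
//     if (ThreadID == Counter) { <critical body> }
//     __kmpc_syncwarp(Mask); // reconverge before the next iteration
//   }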

/// Cast value to the specified type.
static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
                                    QualType ValTy, QualType CastTy,
                                    SourceLocation Loc) {
  assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
         "Cast type must be sized.");
  assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
         "Val type must be sized.");
  llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
  if (ValTy == CastTy)
    return Val;
  if (CGF.getContext().getTypeSizeInChars(ValTy) ==
      CGF.getContext().getTypeSizeInChars(CastTy))
    return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
  if (CastTy->isIntegerType() && ValTy->isIntegerType())
    return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
                                     CastTy->hasSignedIntegerRepresentation());
  Address CastItem = CGF.CreateMemTemp(CastTy);
  Address ValCastItem = CastItem.withElementType(Val->getType());
  CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy,
                        LValueBaseInfo(AlignmentSource::Type),
                        TBAAAccessInfo());
  return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc,
                              LValueBaseInfo(AlignmentSource::Type),
                              TBAAAccessInfo());
}

/// This function creates calls to one of two shuffle functions to copy
/// variables between lanes in a warp.
static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
                                                 llvm::Value *Elem,
                                                 QualType ElemType,
                                                 llvm::Value *Offset,
                                                 SourceLocation Loc) {
  CodeGenModule &CGM = CGF.CGM;
  CGBuilderTy &Bld = CGF.Builder;
  CGOpenMPRuntimeGPU &RT =
      *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));
  llvm::OpenMPIRBuilder &OMPBuilder = RT.getOMPBuilder();

  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
  assert(Size.getQuantity() <= 8 &&
         "Unsupported bitwidth in shuffle instruction.");

  RuntimeFunction ShuffleFn = Size.getQuantity() <= 4
                                  ? OMPRTL___kmpc_shuffle_int32
                                  : OMPRTL___kmpc_shuffle_int64;

  // Cast all types to 32- or 64-bit values before calling shuffle routines.
  QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
      Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
  llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
  llvm::Value *WarpSize =
      Bld.CreateIntCast(RT.getGPUWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);

  llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
      OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), ShuffleFn),
      {ElemCast, Offset, WarpSize});

  return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
}
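
// For instance, a 4-byte reduction element (e.g. a float) is converted to
// i32, moved between lanes via __kmpc_shuffle_int32(val, offset, warp_size),
// and converted back; elements of 5-8 bytes take the __kmpc_shuffle_int64
// path instead.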
1466
1467static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
1468 Address DestAddr, QualType ElemType,
1469 llvm::Value *Offset, SourceLocation Loc) {
1470 CGBuilderTy &Bld = CGF.Builder;
1471
1472 CharUnits Size = CGF.getContext().getTypeSizeInChars(T: ElemType);
1473 // Create the loop over the big sized data.
1474 // ptr = (void*)Elem;
1475 // ptrEnd = (void*) Elem + 1;
1476 // Step = 8;
1477 // while (ptr + Step < ptrEnd)
1478 // shuffle((int64_t)*ptr);
1479 // Step = 4;
1480 // while (ptr + Step < ptrEnd)
1481 // shuffle((int32_t)*ptr);
1482 // ...
1483 Address ElemPtr = DestAddr;
1484 Address Ptr = SrcAddr;
1485 Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
1486 Addr: Bld.CreateConstGEP(Addr: SrcAddr, Index: 1), Ty: CGF.VoidPtrTy, ElementTy: CGF.Int8Ty);
1487 for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
1488 if (Size < CharUnits::fromQuantity(Quantity: IntSize))
1489 continue;
1490 QualType IntType = CGF.getContext().getIntTypeForBitwidth(
1491 DestWidth: CGF.getContext().toBits(CharSize: CharUnits::fromQuantity(Quantity: IntSize)),
1492 /*Signed=*/1);
1493 llvm::Type *IntTy = CGF.ConvertTypeForMem(T: IntType);
1494 Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Addr: Ptr, Ty: IntTy->getPointerTo(),
1495 ElementTy: IntTy);
1496 ElemPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1497 Addr: ElemPtr, Ty: IntTy->getPointerTo(), ElementTy: IntTy);
1498 if (Size.getQuantity() / IntSize > 1) {
1499 llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(name: ".shuffle.pre_cond");
1500 llvm::BasicBlock *ThenBB = CGF.createBasicBlock(name: ".shuffle.then");
1501 llvm::BasicBlock *ExitBB = CGF.createBasicBlock(name: ".shuffle.exit");
1502 llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
1503 CGF.EmitBlock(BB: PreCondBB);
1504 llvm::PHINode *PhiSrc =
1505 Bld.CreatePHI(Ty: Ptr.getType(), /*NumReservedValues=*/2);
1506 PhiSrc->addIncoming(V: Ptr.getPointer(), BB: CurrentBB);
1507 llvm::PHINode *PhiDest =
1508 Bld.CreatePHI(Ty: ElemPtr.getType(), /*NumReservedValues=*/2);
1509 PhiDest->addIncoming(V: ElemPtr.getPointer(), BB: CurrentBB);
1510 Ptr = Address(PhiSrc, Ptr.getElementType(), Ptr.getAlignment());
1511 ElemPtr =
1512 Address(PhiDest, ElemPtr.getElementType(), ElemPtr.getAlignment());
1513 llvm::Value *PtrDiff = Bld.CreatePtrDiff(
1514 ElemTy: CGF.Int8Ty, LHS: PtrEnd.getPointer(),
1515 RHS: Bld.CreatePointerBitCastOrAddrSpaceCast(V: Ptr.getPointer(),
1516 DestTy: CGF.VoidPtrTy));
1517 Bld.CreateCondBr(Cond: Bld.CreateICmpSGT(LHS: PtrDiff, RHS: Bld.getInt64(C: IntSize - 1)),
1518 True: ThenBB, False: ExitBB);
1519 CGF.EmitBlock(BB: ThenBB);
1520 llvm::Value *Res = createRuntimeShuffleFunction(
1521 CGF,
1522 Elem: CGF.EmitLoadOfScalar(Addr: Ptr, /*Volatile=*/false, Ty: IntType, Loc,
1523 BaseInfo: LValueBaseInfo(AlignmentSource::Type),
1524 TBAAInfo: TBAAAccessInfo()),
1525 ElemType: IntType, Offset, Loc);
1526 CGF.EmitStoreOfScalar(Value: Res, Addr: ElemPtr, /*Volatile=*/false, Ty: IntType,
1527 BaseInfo: LValueBaseInfo(AlignmentSource::Type),
1528 TBAAInfo: TBAAAccessInfo());
1529 Address LocalPtr = Bld.CreateConstGEP(Addr: Ptr, Index: 1);
1530 Address LocalElemPtr = Bld.CreateConstGEP(Addr: ElemPtr, Index: 1);
1531 PhiSrc->addIncoming(V: LocalPtr.getPointer(), BB: ThenBB);
1532 PhiDest->addIncoming(V: LocalElemPtr.getPointer(), BB: ThenBB);
1533 CGF.EmitBranch(Block: PreCondBB);
1534 CGF.EmitBlock(BB: ExitBB);
1535 } else {
1536 llvm::Value *Res = createRuntimeShuffleFunction(
1537 CGF,
1538 Elem: CGF.EmitLoadOfScalar(Addr: Ptr, /*Volatile=*/false, Ty: IntType, Loc,
1539 BaseInfo: LValueBaseInfo(AlignmentSource::Type),
1540 TBAAInfo: TBAAAccessInfo()),
1541 ElemType: IntType, Offset, Loc);
1542 CGF.EmitStoreOfScalar(Value: Res, Addr: ElemPtr, /*Volatile=*/false, Ty: IntType,
1543 BaseInfo: LValueBaseInfo(AlignmentSource::Type),
1544 TBAAInfo: TBAAAccessInfo());
1545 Ptr = Bld.CreateConstGEP(Addr: Ptr, Index: 1);
1546 ElemPtr = Bld.CreateConstGEP(Addr: ElemPtr, Index: 1);
1547 }
1548 Size = Size % IntSize;
1549 }
1550}
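// For instance, a 12-byte reduce element is shuffled as one 8-byte chunk
// followed by one 4-byte chunk (an illustration of the decomposition above,
// not the exact emitted IR):
//   __kmpc_shuffle_int64 on bytes [0, 8)
//   __kmpc_shuffle_int32 on bytes [8, 12)
// A chunk size with a trip count greater than one (e.g. the 8-byte step of a
// 24-byte element) takes the PHI-based loop path instead of the straight-line
// path.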
1551
1552namespace {
1553enum CopyAction : unsigned {
1554 // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
1555 // the warp using shuffle instructions.
1556 RemoteLaneToThread,
1557 // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
1558 ThreadCopy,
1559};
1560} // namespace
1561
1562struct CopyOptionsTy {
1563 llvm::Value *RemoteLaneOffset;
1564 llvm::Value *ScratchpadIndex;
1565 llvm::Value *ScratchpadWidth;
1566};
1567
1568/// Emit instructions to copy a Reduce list, which contains partially
1569/// aggregated values, in the specified direction.
1570static void emitReductionListCopy(
1571 CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
1572 ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
1573 CopyOptionsTy CopyOptions = {.RemoteLaneOffset: nullptr, .ScratchpadIndex: nullptr, .ScratchpadWidth: nullptr}) {
1574
1575 CodeGenModule &CGM = CGF.CGM;
1576 ASTContext &C = CGM.getContext();
1577 CGBuilderTy &Bld = CGF.Builder;
1578
1579 llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
1580
1581 // Iterate, element by element, through the source Reduce list and
1582 // make a copy.
1583 unsigned Idx = 0;
1584 for (const Expr *Private : Privates) {
1585 Address SrcElementAddr = Address::invalid();
1586 Address DestElementAddr = Address::invalid();
1587 Address DestElementPtrAddr = Address::invalid();
1588 // Should we shuffle in an element from a remote lane?
1589 bool ShuffleInElement = false;
1590 // Set to true to update the pointer in the dest Reduce list to a
1591 // newly created element.
1592 bool UpdateDestListPtr = false;
1593 QualType PrivatePtrType = C.getPointerType(T: Private->getType());
1594 llvm::Type *PrivateLlvmPtrType = CGF.ConvertType(T: PrivatePtrType);
1595
1596 switch (Action) {
1597 case RemoteLaneToThread: {
1598 // Step 1.1: Get the address for the src element in the Reduce list.
1599 Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(Addr: SrcBase, Index: Idx);
1600 SrcElementAddr = CGF.EmitLoadOfPointer(
1601 Ptr: SrcElementPtrAddr.withElementType(ElemTy: PrivateLlvmPtrType),
1602 PtrTy: PrivatePtrType->castAs<PointerType>());
1603
1604 // Step 1.2: Create a temporary to store the element in the destination
1605 // Reduce list.
1606 DestElementPtrAddr = Bld.CreateConstArrayGEP(Addr: DestBase, Index: Idx);
1607 DestElementAddr =
1608 CGF.CreateMemTemp(T: Private->getType(), Name: ".omp.reduction.element");
1609 ShuffleInElement = true;
1610 UpdateDestListPtr = true;
1611 break;
1612 }
1613 case ThreadCopy: {
1614 // Step 1.1: Get the address for the src element in the Reduce list.
1615 Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(Addr: SrcBase, Index: Idx);
1616 SrcElementAddr = CGF.EmitLoadOfPointer(
1617 Ptr: SrcElementPtrAddr.withElementType(ElemTy: PrivateLlvmPtrType),
1618 PtrTy: PrivatePtrType->castAs<PointerType>());
1619
1620 // Step 1.2: Get the address for dest element. The destination
1621 // element has already been created on the thread's stack.
1622 DestElementPtrAddr = Bld.CreateConstArrayGEP(Addr: DestBase, Index: Idx);
1623 DestElementAddr = CGF.EmitLoadOfPointer(
1624 Ptr: DestElementPtrAddr.withElementType(ElemTy: PrivateLlvmPtrType),
1625 PtrTy: PrivatePtrType->castAs<PointerType>());
1626 break;
1627 }
1628 }
1629
1630 // Regardless of the source and destination of the copy, we emit the load
1631 // of the src element here, since it is required for all copy directions.
1632 SrcElementAddr = SrcElementAddr.withElementType(
1633 ElemTy: CGF.ConvertTypeForMem(T: Private->getType()));
1634 DestElementAddr =
1635 DestElementAddr.withElementType(ElemTy: SrcElementAddr.getElementType());
1636
1637 // Now that all active lanes have read the element in the
1638 // Reduce list, shuffle over the value from the remote lane.
1639 if (ShuffleInElement) {
1640 shuffleAndStore(CGF, SrcAddr: SrcElementAddr, DestAddr: DestElementAddr, ElemType: Private->getType(),
1641 Offset: RemoteLaneOffset, Loc: Private->getExprLoc());
1642 } else {
1643 switch (CGF.getEvaluationKind(T: Private->getType())) {
1644 case TEK_Scalar: {
1645 llvm::Value *Elem = CGF.EmitLoadOfScalar(
1646 Addr: SrcElementAddr, /*Volatile=*/false, Ty: Private->getType(),
1647 Loc: Private->getExprLoc(), BaseInfo: LValueBaseInfo(AlignmentSource::Type),
1648 TBAAInfo: TBAAAccessInfo());
1649 // Store the source element value to the dest element address.
1650 CGF.EmitStoreOfScalar(
1651 Value: Elem, Addr: DestElementAddr, /*Volatile=*/false, Ty: Private->getType(),
1652 BaseInfo: LValueBaseInfo(AlignmentSource::Type), TBAAInfo: TBAAAccessInfo());
1653 break;
1654 }
1655 case TEK_Complex: {
1656 CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex(
1657 src: CGF.MakeAddrLValue(Addr: SrcElementAddr, T: Private->getType()),
1658 loc: Private->getExprLoc());
1659 CGF.EmitStoreOfComplex(
1660 V: Elem, dest: CGF.MakeAddrLValue(Addr: DestElementAddr, T: Private->getType()),
1661 /*isInit=*/false);
1662 break;
1663 }
1664 case TEK_Aggregate:
1665 CGF.EmitAggregateCopy(
1666 Dest: CGF.MakeAddrLValue(Addr: DestElementAddr, T: Private->getType()),
1667 Src: CGF.MakeAddrLValue(Addr: SrcElementAddr, T: Private->getType()),
1668 EltTy: Private->getType(), MayOverlap: AggValueSlot::DoesNotOverlap);
1669 break;
1670 }
1671 }
1672
1673 // Step 3.1: Modify reference in dest Reduce list as needed.
1674 // Modifying the reference in Reduce list to point to the newly
1675 // created element. The element is live in the current function
1676 // scope and that of functions it invokes (i.e., reduce_function).
1677 // RemoteReduceData[i] = (void*)&RemoteElem
1678 if (UpdateDestListPtr) {
1679 CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
1680 V: DestElementAddr.getPointer(), DestTy: CGF.VoidPtrTy),
1681 DestElementPtrAddr, /*Volatile=*/false,
1682 C.VoidPtrTy);
1683 }
1684
1685 ++Idx;
1686 }
1687}
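// In pseudo-code, the net per-element effect of the two copy actions above is
// roughly (illustrative only):
//   RemoteLaneToThread: tmp      = shuffle(*Src[i], RemoteLaneOffset);
//                       Dest[i]  = &tmp;        // list pointer updated
//   ThreadCopy:         *Dest[i] = *Src[i];     // list pointer unchanged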
1688
1689/// This function emits a helper that gathers Reduce lists from the first
1690/// lane of every active warp to lanes in the first warp.
1691///
1692/// void inter_warp_copy_func(void* reduce_data, num_warps)
1693/// shared smem[warp_size];
1694/// For all data entries D in reduce_data:
1695/// sync
1696/// If (I am the first lane in each warp)
1697/// Copy my local D to smem[warp_id]
1698/// sync
1699/// if (I am the first warp)
1700/// Copy smem[thread_id] to my local D
1701static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
1702 ArrayRef<const Expr *> Privates,
1703 QualType ReductionArrayTy,
1704 SourceLocation Loc) {
1705 ASTContext &C = CGM.getContext();
1706 llvm::Module &M = CGM.getModule();
1707
1708 // ReduceList: thread local Reduce list.
1709 // At the stage of the computation when this function is called, partially
1710 // aggregated values reside in the first lane of every active warp.
1711 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
1712 C.VoidPtrTy, ImplicitParamKind::Other);
1713 // NumWarps: number of warps active in the parallel region. This could
1714 // be smaller than 32 (max warps in a CTA) for partial block reduction.
1715 ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
1716 C.getIntTypeForBitwidth(DestWidth: 32, /* Signed */ true),
1717 ImplicitParamKind::Other);
1718 FunctionArgList Args;
1719 Args.push_back(&ReduceListArg);
1720 Args.push_back(&NumWarpsArg);
1721
1722 const CGFunctionInfo &CGFI =
1723 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
1724 auto *Fn = llvm::Function::Create(Ty: CGM.getTypes().GetFunctionType(Info: CGFI),
1725 Linkage: llvm::GlobalValue::InternalLinkage,
1726 N: "_omp_reduction_inter_warp_copy_func", M: &M);
1727 CGM.SetInternalFunctionAttributes(GD: GlobalDecl(), F: Fn, FI: CGFI);
1728 Fn->setDoesNotRecurse();
1729 CodeGenFunction CGF(CGM);
1730 CGF.StartFunction(GD: GlobalDecl(), RetTy: C.VoidTy, Fn: Fn, FnInfo: CGFI, Args, Loc, StartLoc: Loc);
1731
1732 CGBuilderTy &Bld = CGF.Builder;
1733
1734 // This array is used as a medium to transfer, one reduce element at a time,
1735 // the data from the first lane of every warp to lanes in the first warp
1736 // in order to perform the final step of a reduction in a parallel region
1737 // (reduction across warps). The array is placed in NVPTX __shared__ memory
1738 // for reduced latency, as well as to have a distinct copy for concurrently
1739 // executing target regions. The array is declared with weak linkage so
1740 // that a single definition is shared across compilation units.
1741 StringRef TransferMediumName =
1742 "__openmp_nvptx_data_transfer_temporary_storage";
1743 llvm::GlobalVariable *TransferMedium =
1744 M.getGlobalVariable(Name: TransferMediumName);
1745 unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
1746 if (!TransferMedium) {
1747 auto *Ty = llvm::ArrayType::get(ElementType: CGM.Int32Ty, NumElements: WarpSize);
1748 unsigned SharedAddressSpace = C.getTargetAddressSpace(AS: LangAS::cuda_shared);
1749 TransferMedium = new llvm::GlobalVariable(
1750 M, Ty, /*isConstant=*/false, llvm::GlobalVariable::WeakAnyLinkage,
1751 llvm::UndefValue::get(T: Ty), TransferMediumName,
1752 /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
1753 SharedAddressSpace);
1754 CGM.addCompilerUsedGlobal(GV: TransferMedium);
1755 }
1756
1757 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1758 // Get the CUDA thread id of the current OpenMP thread on the GPU.
1759 llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
1760 // nvptx_lane_id = nvptx_id % warpsize
1761 llvm::Value *LaneID = getNVPTXLaneID(CGF);
1762 // nvptx_warp_id = nvptx_id / warpsize
1763 llvm::Value *WarpID = getNVPTXWarpID(CGF);
1764
1765 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
1766 llvm::Type *ElemTy = CGF.ConvertTypeForMem(T: ReductionArrayTy);
1767 Address LocalReduceList(
1768 Bld.CreatePointerBitCastOrAddrSpaceCast(
1769 CGF.EmitLoadOfScalar(
1770 AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc,
1771 LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()),
1772 ElemTy->getPointerTo()),
1773 ElemTy, CGF.getPointerAlign());
1774
1775 unsigned Idx = 0;
1776 for (const Expr *Private : Privates) {
1777 //
1778 // Warp master copies reduce element to transfer medium in __shared__
1779 // memory.
1780 //
1781 unsigned RealTySize =
1782 C.getTypeSizeInChars(T: Private->getType())
1783 .alignTo(Align: C.getTypeAlignInChars(T: Private->getType()))
1784 .getQuantity();
1785 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
1786 unsigned NumIters = RealTySize / TySize;
1787 if (NumIters == 0)
1788 continue;
1789 QualType CType = C.getIntTypeForBitwidth(
1790 DestWidth: C.toBits(CharSize: CharUnits::fromQuantity(Quantity: TySize)), /*Signed=*/1);
1791 llvm::Type *CopyType = CGF.ConvertTypeForMem(T: CType);
1792 CharUnits Align = CharUnits::fromQuantity(Quantity: TySize);
1793 llvm::Value *Cnt = nullptr;
1794 Address CntAddr = Address::invalid();
1795 llvm::BasicBlock *PrecondBB = nullptr;
1796 llvm::BasicBlock *ExitBB = nullptr;
1797 if (NumIters > 1) {
1798 CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
1799 CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(Ty: CGM.IntTy), CntAddr,
1800 /*Volatile=*/false, C.IntTy);
1801 PrecondBB = CGF.createBasicBlock(name: "precond");
1802 ExitBB = CGF.createBasicBlock(name: "exit");
1803 llvm::BasicBlock *BodyBB = CGF.createBasicBlock(name: "body");
1804 // There is no need to emit line number for unconditional branch.
1805 (void)ApplyDebugLocation::CreateEmpty(CGF);
1806 CGF.EmitBlock(BB: PrecondBB);
1807 Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
1808 llvm::Value *Cmp =
1809 Bld.CreateICmpULT(LHS: Cnt, RHS: llvm::ConstantInt::get(Ty: CGM.IntTy, V: NumIters));
1810 Bld.CreateCondBr(Cond: Cmp, True: BodyBB, False: ExitBB);
1811 CGF.EmitBlock(BB: BodyBB);
1812 }
1813 // kmpc_barrier.
1814 CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
1815 /*EmitChecks=*/false,
1816 /*ForceSimpleCall=*/true);
1817 llvm::BasicBlock *ThenBB = CGF.createBasicBlock(name: "then");
1818 llvm::BasicBlock *ElseBB = CGF.createBasicBlock(name: "else");
1819 llvm::BasicBlock *MergeBB = CGF.createBasicBlock(name: "ifcont");
1820
1821 // if (lane_id == 0)
1822 llvm::Value *IsWarpMaster = Bld.CreateIsNull(Arg: LaneID, Name: "warp_master");
1823 Bld.CreateCondBr(Cond: IsWarpMaster, True: ThenBB, False: ElseBB);
1824 CGF.EmitBlock(BB: ThenBB);
1825
1826 // Reduce element = LocalReduceList[i]
1827 Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(Addr: LocalReduceList, Index: Idx);
1828 llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
1829 ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
1830 // elemptr = ((CopyType*)(elemptrptr)) + I
1831 Address ElemPtr(ElemPtrPtr, CopyType, Align);
1832 if (NumIters > 1)
1833 ElemPtr = Bld.CreateGEP(Addr: ElemPtr, Index: Cnt);
1834
1835 // Get pointer to location in transfer medium.
1836 // MediumPtr = &medium[warp_id]
1837 llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
1838 Ty: TransferMedium->getValueType(), Ptr: TransferMedium,
1839 IdxList: {llvm::Constant::getNullValue(Ty: CGM.Int64Ty), WarpID});
1840 // Casting to actual data type.
1841 // MediumPtr = (CopyType*)MediumPtrAddr;
1842 Address MediumPtr(MediumPtrVal, CopyType, Align);
1843
1844 // elem = *elemptr
1845 //*MediumPtr = elem
1846 llvm::Value *Elem = CGF.EmitLoadOfScalar(
1847 Addr: ElemPtr, /*Volatile=*/false, Ty: CType, Loc,
1848 BaseInfo: LValueBaseInfo(AlignmentSource::Type), TBAAInfo: TBAAAccessInfo());
1849 // Store the source element value to the dest element address.
1850 CGF.EmitStoreOfScalar(Value: Elem, Addr: MediumPtr, /*Volatile=*/true, Ty: CType,
1851 BaseInfo: LValueBaseInfo(AlignmentSource::Type),
1852 TBAAInfo: TBAAAccessInfo());
1853
1854 Bld.CreateBr(Dest: MergeBB);
1855
1856 CGF.EmitBlock(BB: ElseBB);
1857 Bld.CreateBr(Dest: MergeBB);
1858
1859 CGF.EmitBlock(BB: MergeBB);
1860
1861 // kmpc_barrier.
1862 CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
1863 /*EmitChecks=*/false,
1864 /*ForceSimpleCall=*/true);
1865
1866 //
1867 // Warp 0 copies reduce element from transfer medium.
1868 //
1869 llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock(name: "then");
1870 llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock(name: "else");
1871 llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock(name: "ifcont");
1872
1873 Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
1874 llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
1875 AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);
1876
1877 // Up to 32 threads in warp 0 are active.
1878 llvm::Value *IsActiveThread =
1879 Bld.CreateICmpULT(LHS: ThreadID, RHS: NumWarpsVal, Name: "is_active_thread");
1880 Bld.CreateCondBr(Cond: IsActiveThread, True: W0ThenBB, False: W0ElseBB);
1881
1882 CGF.EmitBlock(BB: W0ThenBB);
1883
1884 // SrcMediumPtr = &medium[tid]
1885 llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
1886 Ty: TransferMedium->getValueType(), Ptr: TransferMedium,
1887 IdxList: {llvm::Constant::getNullValue(Ty: CGM.Int64Ty), ThreadID});
1888 // SrcMediumVal = *SrcMediumPtr;
1889 Address SrcMediumPtr(SrcMediumPtrVal, CopyType, Align);
1890
1891 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
1892 Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(Addr: LocalReduceList, Index: Idx);
1893 llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
1894 TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc);
1895 Address TargetElemPtr(TargetElemPtrVal, CopyType, Align);
1896 if (NumIters > 1)
1897 TargetElemPtr = Bld.CreateGEP(Addr: TargetElemPtr, Index: Cnt);
1898
1899 // *TargetElemPtr = SrcMediumVal;
1900 llvm::Value *SrcMediumValue =
1901 CGF.EmitLoadOfScalar(Addr: SrcMediumPtr, /*Volatile=*/true, Ty: CType, Loc);
1902 CGF.EmitStoreOfScalar(Value: SrcMediumValue, Addr: TargetElemPtr, /*Volatile=*/false,
1903 Ty: CType);
1904 Bld.CreateBr(Dest: W0MergeBB);
1905
1906 CGF.EmitBlock(BB: W0ElseBB);
1907 Bld.CreateBr(Dest: W0MergeBB);
1908
1909 CGF.EmitBlock(BB: W0MergeBB);
1910
1911 if (NumIters > 1) {
1912 Cnt = Bld.CreateNSWAdd(LHS: Cnt, RHS: llvm::ConstantInt::get(Ty: CGM.IntTy, /*V=*/1));
1913 CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);
1914 CGF.EmitBranch(Block: PrecondBB);
1915 (void)ApplyDebugLocation::CreateEmpty(CGF);
1916 CGF.EmitBlock(BB: ExitBB);
1917 }
1918 RealTySize %= TySize;
1919 }
1920 ++Idx;
1921 }
1922
1923 CGF.FinishFunction();
1924 return Fn;
1925}
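// As an example of the chunking above (a sketch, not the exact emitted IR): a
// 6-byte reduce element is routed through the WarpSize x i32 transfer medium
// as one 4-byte round followed by one 2-byte round, with a runtime barrier
// before and after each round so that the warp masters publish their chunk
// before the threads of warp 0 read it back.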
1926
1927/// Emit a helper that reduces data across two OpenMP threads (lanes)
1928/// in the same warp. It uses shuffle instructions to copy over data from
1929/// a remote lane's stack. The reduction algorithm performed is specified
1930/// by the fourth parameter.
1931///
1932/// Algorithm Versions.
1933/// Full Warp Reduce (argument value 0):
1934/// This algorithm assumes that all 32 lanes are active and gathers
1935/// data from these 32 lanes, producing a single resultant value.
1936/// Contiguous Partial Warp Reduce (argument value 1):
1937/// This algorithm assumes that only a *contiguous* subset of lanes
1938/// are active. This happens for the last warp in a parallel region
1939/// when the user specified num_threads is not an integer multiple of
1940/// 32. This contiguous subset always starts with the zeroth lane.
1941/// Partial Warp Reduce (argument value 2):
1942/// This algorithm gathers data from any number of lanes at any position.
1943/// All reduced values are stored in the lowest possible lane. The set
1944/// of problems every algorithm addresses is a superset of those
1945/// addressable by algorithms with a lower version number. Overhead
1946/// increases as algorithm version increases.
1947///
1948/// Terminology
1949/// Reduce element:
1950/// Reduce element refers to the individual data field with primitive
1951/// data types to be combined and reduced across threads.
1952/// Reduce list:
1953/// Reduce list refers to a collection of local, thread-private
1954/// reduce elements.
1955/// Remote Reduce list:
1956/// Remote Reduce list refers to a collection of remote (relative to
1957/// the current thread) reduce elements.
1958///
1959/// We distinguish between three states of threads that are important to
1960/// the implementation of this function.
1961/// Alive threads:
1962/// Threads in a warp executing the SIMT instruction, as distinguished from
1963/// threads that are inactive due to divergent control flow.
1964/// Active threads:
1965/// The minimal set of threads that has to be alive upon entry to this
1966/// function. The computation is correct iff active threads are alive.
1967/// Some threads are alive but they are not active because they do not
1968/// contribute to the computation in any useful manner. Turning them off
1969/// may introduce control flow overheads without any tangible benefits.
1970/// Effective threads:
1971/// In order to comply with the argument requirements of the shuffle
1972/// function, we must keep all lanes holding data alive. But at most
1973/// half of them perform value aggregation; we refer to this half of
1974/// threads as effective. The other half is simply handing off their
1975/// data.
1976///
1977/// Procedure
1978/// Value shuffle:
1979/// In this step active threads transfer data from higher lane positions
1980/// in the warp to lower lane positions, creating Remote Reduce list.
1981/// Value aggregation:
1982/// In this step, effective threads combine their thread local Reduce list
1983/// with Remote Reduce list and store the result in the thread local
1984/// Reduce list.
1985/// Value copy:
1986/// In this step, we deal with the assumption made by algorithm 2
1987/// (i.e. contiguity assumption). When we have an odd number of lanes
1988/// active, say 2k+1, only k threads will be effective and therefore k
1989/// new values will be produced. However, the Reduce list owned by the
1990/// (2k+1)th thread is ignored in the value aggregation. Therefore
1991/// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
1992/// that the contiguity assumption still holds.
1993static llvm::Function *emitShuffleAndReduceFunction(
1994 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
1995 QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) {
1996 ASTContext &C = CGM.getContext();
1997
1998 // Thread local Reduce list used to host the values of data to be reduced.
1999 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2000 C.VoidPtrTy, ImplicitParamKind::Other);
2001 // Current lane id; could be logical.
2002 ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
2003 ImplicitParamKind::Other);
2004 // Offset of the remote source lane relative to the current lane.
2005 ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2006 C.ShortTy, ImplicitParamKind::Other);
2007 // Algorithm version. This is expected to be known at compile time.
2008 ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2009 C.ShortTy, ImplicitParamKind::Other);
2010 FunctionArgList Args;
2011 Args.push_back(&ReduceListArg);
2012 Args.push_back(&LaneIDArg);
2013 Args.push_back(&RemoteLaneOffsetArg);
2014 Args.push_back(&AlgoVerArg);
2015
2016 const CGFunctionInfo &CGFI =
2017 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2018 auto *Fn = llvm::Function::Create(
2019 Ty: CGM.getTypes().GetFunctionType(Info: CGFI), Linkage: llvm::GlobalValue::InternalLinkage,
2020 N: "_omp_reduction_shuffle_and_reduce_func", M: &CGM.getModule());
2021 CGM.SetInternalFunctionAttributes(GD: GlobalDecl(), F: Fn, FI: CGFI);
2022 Fn->setDoesNotRecurse();
2023
2024 CodeGenFunction CGF(CGM);
2025 CGF.StartFunction(GD: GlobalDecl(), RetTy: C.VoidTy, Fn: Fn, FnInfo: CGFI, Args, Loc, StartLoc: Loc);
2026
2027 CGBuilderTy &Bld = CGF.Builder;
2028
2029 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2030 llvm::Type *ElemTy = CGF.ConvertTypeForMem(T: ReductionArrayTy);
2031 Address LocalReduceList(
2032 Bld.CreatePointerBitCastOrAddrSpaceCast(
2033 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2034 C.VoidPtrTy, SourceLocation()),
2035 ElemTy->getPointerTo()),
2036 ElemTy, CGF.getPointerAlign());
2037
2038 Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
2039 llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
2040 AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2041
2042 Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
2043 llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
2044 AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2045
2046 Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
2047 llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
2048 AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2049
2050 // Create a local thread-private variable to host the Reduce list
2051 // from a remote lane.
2052 Address RemoteReduceList =
2053 CGF.CreateMemTemp(T: ReductionArrayTy, Name: ".omp.reduction.remote_reduce_list");
2054
2055 // This loop iterates through the list of reduce elements and copies,
2056 // element by element, from a remote lane in the warp to RemoteReduceList,
2057 // hosted on the thread's stack.
2058 emitReductionListCopy(Action: RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
2059 SrcBase: LocalReduceList, DestBase: RemoteReduceList,
2060 CopyOptions: {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
2061 /*ScratchpadIndex=*/nullptr,
2062 /*ScratchpadWidth=*/nullptr});
2063
2064 // The actions to be performed on the Remote Reduce list are dependent
2065 // on the algorithm version.
2066 //
2067 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
2068 // LaneId % 2 == 0 && Offset > 0):
2069 // do the reduction value aggregation
2070 //
2071 // The thread local variable Reduce list is mutated in place to host the
2072 // reduced data, which is the aggregated value produced from local and
2073 // remote lanes.
2074 //
2075 // Note that AlgoVer is expected to be a constant integer known at compile
2076 // time.
2077 // When AlgoVer==0, the first conjunction evaluates to true, making
2078 // the entire predicate true during compile time.
2079 // When AlgoVer==1, the second conjunction has only the second part to be
2080 // evaluated during runtime. Other conjunctions evaluate to false
2081 // during compile time.
2082 // When AlgoVer==2, the third conjunction has only the second part to be
2083 // evaluated during runtime. Other conjunctions evaluate to false
2084 // during compile time.
2085 llvm::Value *CondAlgo0 = Bld.CreateIsNull(Arg: AlgoVerArgVal);
2086
2087 llvm::Value *Algo1 = Bld.CreateICmpEQ(LHS: AlgoVerArgVal, RHS: Bld.getInt16(C: 1));
2088 llvm::Value *CondAlgo1 = Bld.CreateAnd(
2089 LHS: Algo1, RHS: Bld.CreateICmpULT(LHS: LaneIDArgVal, RHS: RemoteLaneOffsetArgVal));
2090
2091 llvm::Value *Algo2 = Bld.CreateICmpEQ(LHS: AlgoVerArgVal, RHS: Bld.getInt16(C: 2));
2092 llvm::Value *CondAlgo2 = Bld.CreateAnd(
2093 LHS: Algo2, RHS: Bld.CreateIsNull(Arg: Bld.CreateAnd(LHS: LaneIDArgVal, RHS: Bld.getInt16(C: 1))));
2094 CondAlgo2 = Bld.CreateAnd(
2095 LHS: CondAlgo2, RHS: Bld.CreateICmpSGT(LHS: RemoteLaneOffsetArgVal, RHS: Bld.getInt16(C: 0)));
2096
2097 llvm::Value *CondReduce = Bld.CreateOr(LHS: CondAlgo0, RHS: CondAlgo1);
2098 CondReduce = Bld.CreateOr(LHS: CondReduce, RHS: CondAlgo2);
2099
2100 llvm::BasicBlock *ThenBB = CGF.createBasicBlock(name: "then");
2101 llvm::BasicBlock *ElseBB = CGF.createBasicBlock(name: "else");
2102 llvm::BasicBlock *MergeBB = CGF.createBasicBlock(name: "ifcont");
2103 Bld.CreateCondBr(Cond: CondReduce, True: ThenBB, False: ElseBB);
2104
2105 CGF.EmitBlock(BB: ThenBB);
2106 // reduce_function(LocalReduceList, RemoteReduceList)
2107 llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2108 V: LocalReduceList.getPointer(), DestTy: CGF.VoidPtrTy);
2109 llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2110 V: RemoteReduceList.getPointer(), DestTy: CGF.VoidPtrTy);
2111 CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2112 CGF, Loc, OutlinedFn: ReduceFn, Args: {LocalReduceListPtr, RemoteReduceListPtr});
2113 Bld.CreateBr(Dest: MergeBB);
2114
2115 CGF.EmitBlock(BB: ElseBB);
2116 Bld.CreateBr(Dest: MergeBB);
2117
2118 CGF.EmitBlock(BB: MergeBB);
2119
2120 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
2121 // Reduce list.
2122 Algo1 = Bld.CreateICmpEQ(LHS: AlgoVerArgVal, RHS: Bld.getInt16(C: 1));
2123 llvm::Value *CondCopy = Bld.CreateAnd(
2124 LHS: Algo1, RHS: Bld.CreateICmpUGE(LHS: LaneIDArgVal, RHS: RemoteLaneOffsetArgVal));
2125
2126 llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock(name: "then");
2127 llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock(name: "else");
2128 llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock(name: "ifcont");
2129 Bld.CreateCondBr(Cond: CondCopy, True: CpyThenBB, False: CpyElseBB);
2130
2131 CGF.EmitBlock(BB: CpyThenBB);
2132 emitReductionListCopy(Action: ThreadCopy, CGF, ReductionArrayTy, Privates,
2133 SrcBase: RemoteReduceList, DestBase: LocalReduceList);
2134 Bld.CreateBr(Dest: CpyMergeBB);
2135
2136 CGF.EmitBlock(BB: CpyElseBB);
2137 Bld.CreateBr(Dest: CpyMergeBB);
2138
2139 CGF.EmitBlock(BB: CpyMergeBB);
2140
2141 CGF.FinishFunction();
2142 return Fn;
2143}
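// As a concrete illustration (following the 'Full Warp Reduce' pseudo-code in
// the design notes below), the runtime is expected to drive this helper as:
//   shuffleReduceFn(reduce_data, /*LaneId=*/0, /*Offset=*/16, /*AlgoVer=*/0);
//   shuffleReduceFn(reduce_data, /*LaneId=*/0, /*Offset=*/8,  /*AlgoVer=*/0);
//   ... and so on for offsets 4, 2 and 1 ...
// With AlgoVer == 0 the reduce predicate above folds to true at compile time,
// so every lane aggregates the shuffled-in remote Reduce list into its own.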
2144
2145/// This function emits a helper that copies all the reduction variables from
2146/// the team into the provided global buffer for the reduction variables.
2147///
2148/// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
2149/// For all data entries D in reduce_data:
2150/// Copy local D to buffer.D[Idx]
2151static llvm::Value *emitListToGlobalCopyFunction(
2152 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2153 QualType ReductionArrayTy, SourceLocation Loc,
2154 const RecordDecl *TeamReductionRec,
2155 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2156 &VarFieldMap) {
2157 ASTContext &C = CGM.getContext();
2158
2159 // Buffer: global reduction buffer.
2160 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2161 C.VoidPtrTy, ImplicitParamKind::Other);
2162 // Idx: index of the buffer.
2163 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2164 ImplicitParamKind::Other);
2165 // ReduceList: thread local Reduce list.
2166 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2167 C.VoidPtrTy, ImplicitParamKind::Other);
2168 FunctionArgList Args;
2169 Args.push_back(&BufferArg);
2170 Args.push_back(&IdxArg);
2171 Args.push_back(&ReduceListArg);
2172
2173 const CGFunctionInfo &CGFI =
2174 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2175 auto *Fn = llvm::Function::Create(
2176 Ty: CGM.getTypes().GetFunctionType(Info: CGFI), Linkage: llvm::GlobalValue::InternalLinkage,
2177 N: "_omp_reduction_list_to_global_copy_func", M: &CGM.getModule());
2178 CGM.SetInternalFunctionAttributes(GD: GlobalDecl(), F: Fn, FI: CGFI);
2179 Fn->setDoesNotRecurse();
2180 CodeGenFunction CGF(CGM);
2181 CGF.StartFunction(GD: GlobalDecl(), RetTy: C.VoidTy, Fn: Fn, FnInfo: CGFI, Args, Loc, StartLoc: Loc);
2182
2183 CGBuilderTy &Bld = CGF.Builder;
2184
2185 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2186 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2187 llvm::Type *ElemTy = CGF.ConvertTypeForMem(T: ReductionArrayTy);
2188 Address LocalReduceList(
2189 Bld.CreatePointerBitCastOrAddrSpaceCast(
2190 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2191 C.VoidPtrTy, Loc),
2192 ElemTy->getPointerTo()),
2193 ElemTy, CGF.getPointerAlign());
2194 QualType StaticTy = C.getRecordType(Decl: TeamReductionRec);
2195 llvm::Type *LLVMReductionsBufferTy =
2196 CGM.getTypes().ConvertTypeForMem(T: StaticTy);
2197 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2198 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2199 LLVMReductionsBufferTy->getPointerTo());
2200 llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2201 /*Volatile=*/false, C.IntTy,
2202 Loc)};
2203 unsigned Idx = 0;
2204 for (const Expr *Private : Privates) {
2205 // Reduce element = LocalReduceList[i]
2206 Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(Addr: LocalReduceList, Index: Idx);
2207 llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2208 ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2209 // elemptr = ((CopyType*)(elemptrptr)) + I
2210 ElemTy = CGF.ConvertTypeForMem(T: Private->getType());
2211 ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2212 V: ElemPtrPtr, DestTy: ElemTy->getPointerTo());
2213 Address ElemPtr =
2214 Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(T: Private->getType()));
2215 const ValueDecl *VD = cast<DeclRefExpr>(Val: Private)->getDecl();
2216 // Global = Buffer.VD[Idx];
2217 const FieldDecl *FD = VarFieldMap.lookup(Val: VD);
2218 llvm::Value *BufferPtr =
2219 Bld.CreateInBoundsGEP(Ty: LLVMReductionsBufferTy, Ptr: BufferArrPtr, IdxList: Idxs);
2220 LValue GlobLVal = CGF.EmitLValueForField(
2221 Base: CGF.MakeNaturalAlignAddrLValue(V: BufferPtr, T: StaticTy), Field: FD);
2222 Address GlobAddr = GlobLVal.getAddress(CGF);
2223 GlobLVal.setAddress(Address(GlobAddr.getPointer(),
2224 CGF.ConvertTypeForMem(T: Private->getType()),
2225 GlobAddr.getAlignment()));
2226 switch (CGF.getEvaluationKind(T: Private->getType())) {
2227 case TEK_Scalar: {
2228 llvm::Value *V = CGF.EmitLoadOfScalar(
2229 Addr: ElemPtr, /*Volatile=*/false, Ty: Private->getType(), Loc,
2230 BaseInfo: LValueBaseInfo(AlignmentSource::Type), TBAAInfo: TBAAAccessInfo());
2231 CGF.EmitStoreOfScalar(value: V, lvalue: GlobLVal);
2232 break;
2233 }
2234 case TEK_Complex: {
2235 CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(
2236 src: CGF.MakeAddrLValue(Addr: ElemPtr, T: Private->getType()), loc: Loc);
2237 CGF.EmitStoreOfComplex(V, dest: GlobLVal, /*isInit=*/false);
2238 break;
2239 }
2240 case TEK_Aggregate:
2241 CGF.EmitAggregateCopy(Dest: GlobLVal,
2242 Src: CGF.MakeAddrLValue(Addr: ElemPtr, T: Private->getType()),
2243 EltTy: Private->getType(), MayOverlap: AggValueSlot::DoesNotOverlap);
2244 break;
2245 }
2246 ++Idx;
2247 }
2248
2249 CGF.FinishFunction();
2250 return Fn;
2251}
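// For a hypothetical 'reduction(+:foo) reduction(*:bar)' with 'float foo' and
// 'double bar' (as in the design notes below), TeamReductionRec is roughly
// 'struct { float foo; double bar; }' and the generated copy amounts to (a
// sketch, not the exact emitted IR):
//   buffer[Idx].foo = *(float *)reduce_data[0];
//   buffer[Idx].bar = *(double *)reduce_data[1];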
2252
2253/// This function emits a helper that reduces all the reduction variables from
2254/// the team into the provided global buffer for the reduction variables.
2255///
2256/// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
2257/// void *GlobPtrs[];
2258/// GlobPtrs[0] = (void*)&buffer.D0[Idx];
2259/// ...
2260/// GlobPtrs[N] = (void*)&buffer.DN[Idx];
2261/// reduce_function(GlobPtrs, reduce_data);
2262static llvm::Value *emitListToGlobalReduceFunction(
2263 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2264 QualType ReductionArrayTy, SourceLocation Loc,
2265 const RecordDecl *TeamReductionRec,
2266 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2267 &VarFieldMap,
2268 llvm::Function *ReduceFn) {
2269 ASTContext &C = CGM.getContext();
2270
2271 // Buffer: global reduction buffer.
2272 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2273 C.VoidPtrTy, ImplicitParamKind::Other);
2274 // Idx: index of the buffer.
2275 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2276 ImplicitParamKind::Other);
2277 // ReduceList: thread local Reduce list.
2278 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2279 C.VoidPtrTy, ImplicitParamKind::Other);
2280 FunctionArgList Args;
2281 Args.push_back(&BufferArg);
2282 Args.push_back(&IdxArg);
2283 Args.push_back(&ReduceListArg);
2284
2285 const CGFunctionInfo &CGFI =
2286 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2287 auto *Fn = llvm::Function::Create(
2288 Ty: CGM.getTypes().GetFunctionType(Info: CGFI), Linkage: llvm::GlobalValue::InternalLinkage,
2289 N: "_omp_reduction_list_to_global_reduce_func", M: &CGM.getModule());
2290 CGM.SetInternalFunctionAttributes(GD: GlobalDecl(), F: Fn, FI: CGFI);
2291 Fn->setDoesNotRecurse();
2292 CodeGenFunction CGF(CGM);
2293 CGF.StartFunction(GD: GlobalDecl(), RetTy: C.VoidTy, Fn: Fn, FnInfo: CGFI, Args, Loc, StartLoc: Loc);
2294
2295 CGBuilderTy &Bld = CGF.Builder;
2296
2297 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2298 QualType StaticTy = C.getRecordType(Decl: TeamReductionRec);
2299 llvm::Type *LLVMReductionsBufferTy =
2300 CGM.getTypes().ConvertTypeForMem(T: StaticTy);
2301 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2302 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2303 LLVMReductionsBufferTy->getPointerTo());
2304
2305 // 1. Build a list of reduction variables.
2306 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2307 Address ReductionList =
2308 CGF.CreateMemTemp(T: ReductionArrayTy, Name: ".omp.reduction.red_list");
2309 auto IPriv = Privates.begin();
2310 llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2311 /*Volatile=*/false, C.IntTy,
2312 Loc)};
2313 unsigned Idx = 0;
2314 for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
2315 Address Elem = CGF.Builder.CreateConstArrayGEP(Addr: ReductionList, Index: Idx);
2316 // Global = Buffer.VD[Idx];
2317 const ValueDecl *VD = cast<DeclRefExpr>(Val: *IPriv)->getDecl();
2318 const FieldDecl *FD = VarFieldMap.lookup(Val: VD);
2319 llvm::Value *BufferPtr =
2320 Bld.CreateInBoundsGEP(Ty: LLVMReductionsBufferTy, Ptr: BufferArrPtr, IdxList: Idxs);
2321 LValue GlobLVal = CGF.EmitLValueForField(
2322 Base: CGF.MakeNaturalAlignAddrLValue(V: BufferPtr, T: StaticTy), Field: FD);
2323 Address GlobAddr = GlobLVal.getAddress(CGF);
2324 CGF.EmitStoreOfScalar(GlobAddr.getPointer(), Elem, /*Volatile=*/false,
2325 C.VoidPtrTy);
2326 if ((*IPriv)->getType()->isVariablyModifiedType()) {
2327 // Store array size.
2328 ++Idx;
2329 Elem = CGF.Builder.CreateConstArrayGEP(Addr: ReductionList, Index: Idx);
2330 llvm::Value *Size = CGF.Builder.CreateIntCast(
2331 V: CGF.getVLASize(
2332 vla: CGF.getContext().getAsVariableArrayType(T: (*IPriv)->getType()))
2333 .NumElts,
2334 DestTy: CGF.SizeTy, /*isSigned=*/false);
2335 CGF.Builder.CreateStore(Val: CGF.Builder.CreateIntToPtr(V: Size, DestTy: CGF.VoidPtrTy),
2336 Addr: Elem);
2337 }
2338 }
2339
2340 // Call reduce_function(GlobalReduceList, ReduceList)
2341 llvm::Value *GlobalReduceList = ReductionList.getPointer();
2342 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2343 llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
2344 AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
2345 CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2346 CGF, Loc, OutlinedFn: ReduceFn, Args: {GlobalReduceList, ReducedPtr});
2347 CGF.FinishFunction();
2348 return Fn;
2349}
2350
2351/// This function emits a helper that copies all the reduction variables from
2352/// the provided global buffer back into the team's thread-local reduce list.
2353///
2354/// void global_to_list_copy_func(void *buffer, int Idx, void *reduce_data)
2355/// For all data entries D in reduce_data:
2356/// Copy buffer.D[Idx] to local D;
2357static llvm::Value *emitGlobalToListCopyFunction(
2358 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2359 QualType ReductionArrayTy, SourceLocation Loc,
2360 const RecordDecl *TeamReductionRec,
2361 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2362 &VarFieldMap) {
2363 ASTContext &C = CGM.getContext();
2364
2365 // Buffer: global reduction buffer.
2366 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2367 C.VoidPtrTy, ImplicitParamKind::Other);
2368 // Idx: index of the buffer.
2369 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2370 ImplicitParamKind::Other);
2371 // ReduceList: thread local Reduce list.
2372 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2373 C.VoidPtrTy, ImplicitParamKind::Other);
2374 FunctionArgList Args;
2375 Args.push_back(&BufferArg);
2376 Args.push_back(&IdxArg);
2377 Args.push_back(&ReduceListArg);
2378
2379 const CGFunctionInfo &CGFI =
2380 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2381 auto *Fn = llvm::Function::Create(
2382 Ty: CGM.getTypes().GetFunctionType(Info: CGFI), Linkage: llvm::GlobalValue::InternalLinkage,
2383 N: "_omp_reduction_global_to_list_copy_func", M: &CGM.getModule());
2384 CGM.SetInternalFunctionAttributes(GD: GlobalDecl(), F: Fn, FI: CGFI);
2385 Fn->setDoesNotRecurse();
2386 CodeGenFunction CGF(CGM);
2387 CGF.StartFunction(GD: GlobalDecl(), RetTy: C.VoidTy, Fn: Fn, FnInfo: CGFI, Args, Loc, StartLoc: Loc);
2388
2389 CGBuilderTy &Bld = CGF.Builder;
2390
2391 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2392 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2393 llvm::Type *ElemTy = CGF.ConvertTypeForMem(T: ReductionArrayTy);
2394 Address LocalReduceList(
2395 Bld.CreatePointerBitCastOrAddrSpaceCast(
2396 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2397 C.VoidPtrTy, Loc),
2398 ElemTy->getPointerTo()),
2399 ElemTy, CGF.getPointerAlign());
2400 QualType StaticTy = C.getRecordType(Decl: TeamReductionRec);
2401 llvm::Type *LLVMReductionsBufferTy =
2402 CGM.getTypes().ConvertTypeForMem(T: StaticTy);
2403 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2404 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2405 LLVMReductionsBufferTy->getPointerTo());
2406
2407 llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2408 /*Volatile=*/false, C.IntTy,
2409 Loc)};
2410 unsigned Idx = 0;
2411 for (const Expr *Private : Privates) {
2412 // Reduce element = LocalReduceList[i]
2413 Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(Addr: LocalReduceList, Index: Idx);
2414 llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2415 ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2416 // elemptr = ((CopyType*)(elemptrptr)) + I
2417 ElemTy = CGF.ConvertTypeForMem(T: Private->getType());
2418 ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2419 V: ElemPtrPtr, DestTy: ElemTy->getPointerTo());
2420 Address ElemPtr =
2421 Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(T: Private->getType()));
2422 const ValueDecl *VD = cast<DeclRefExpr>(Val: Private)->getDecl();
2423 // Global = Buffer.VD[Idx];
2424 const FieldDecl *FD = VarFieldMap.lookup(Val: VD);
2425 llvm::Value *BufferPtr =
2426 Bld.CreateInBoundsGEP(Ty: LLVMReductionsBufferTy, Ptr: BufferArrPtr, IdxList: Idxs);
2427 LValue GlobLVal = CGF.EmitLValueForField(
2428 Base: CGF.MakeNaturalAlignAddrLValue(V: BufferPtr, T: StaticTy), Field: FD);
2429 Address GlobAddr = GlobLVal.getAddress(CGF);
2430 GlobLVal.setAddress(Address(GlobAddr.getPointer(),
2431 CGF.ConvertTypeForMem(T: Private->getType()),
2432 GlobAddr.getAlignment()));
2433 switch (CGF.getEvaluationKind(T: Private->getType())) {
2434 case TEK_Scalar: {
2435 llvm::Value *V = CGF.EmitLoadOfScalar(lvalue: GlobLVal, Loc);
2436 CGF.EmitStoreOfScalar(Value: V, Addr: ElemPtr, /*Volatile=*/false, Ty: Private->getType(),
2437 BaseInfo: LValueBaseInfo(AlignmentSource::Type),
2438 TBAAInfo: TBAAAccessInfo());
2439 break;
2440 }
2441 case TEK_Complex: {
2442 CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(src: GlobLVal, loc: Loc);
2443 CGF.EmitStoreOfComplex(V, dest: CGF.MakeAddrLValue(Addr: ElemPtr, T: Private->getType()),
2444 /*isInit=*/false);
2445 break;
2446 }
2447 case TEK_Aggregate:
2448 CGF.EmitAggregateCopy(Dest: CGF.MakeAddrLValue(Addr: ElemPtr, T: Private->getType()),
2449 Src: GlobLVal, EltTy: Private->getType(),
2450 MayOverlap: AggValueSlot::DoesNotOverlap);
2451 break;
2452 }
2453 ++Idx;
2454 }
2455
2456 CGF.FinishFunction();
2457 return Fn;
2458}
2459
2460/// This function emits a helper that reduces the data in the provided global
2461/// buffer into the team's thread-local reduce list (reduce_data).
2462///
2463/// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
2464/// void *GlobPtrs[];
2465/// GlobPtrs[0] = (void*)&buffer.D0[Idx];
2466/// ...
2467/// GlobPtrs[N] = (void*)&buffer.DN[Idx];
2468/// reduce_function(reduce_data, GlobPtrs);
2469static llvm::Value *emitGlobalToListReduceFunction(
2470 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2471 QualType ReductionArrayTy, SourceLocation Loc,
2472 const RecordDecl *TeamReductionRec,
2473 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2474 &VarFieldMap,
2475 llvm::Function *ReduceFn) {
2476 ASTContext &C = CGM.getContext();
2477
2478 // Buffer: global reduction buffer.
2479 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2480 C.VoidPtrTy, ImplicitParamKind::Other);
2481 // Idx: index of the buffer.
2482 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2483 ImplicitParamKind::Other);
2484 // ReduceList: thread local Reduce list.
2485 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2486 C.VoidPtrTy, ImplicitParamKind::Other);
2487 FunctionArgList Args;
2488 Args.push_back(&BufferArg);
2489 Args.push_back(&IdxArg);
2490 Args.push_back(&ReduceListArg);
2491
2492 const CGFunctionInfo &CGFI =
2493 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2494 auto *Fn = llvm::Function::Create(
2495 Ty: CGM.getTypes().GetFunctionType(Info: CGFI), Linkage: llvm::GlobalValue::InternalLinkage,
2496 N: "_omp_reduction_global_to_list_reduce_func", M: &CGM.getModule());
2497 CGM.SetInternalFunctionAttributes(GD: GlobalDecl(), F: Fn, FI: CGFI);
2498 Fn->setDoesNotRecurse();
2499 CodeGenFunction CGF(CGM);
2500 CGF.StartFunction(GD: GlobalDecl(), RetTy: C.VoidTy, Fn: Fn, FnInfo: CGFI, Args, Loc, StartLoc: Loc);
2501
2502 CGBuilderTy &Bld = CGF.Builder;
2503
2504 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2505 QualType StaticTy = C.getRecordType(Decl: TeamReductionRec);
2506 llvm::Type *LLVMReductionsBufferTy =
2507 CGM.getTypes().ConvertTypeForMem(T: StaticTy);
2508 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2509 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2510 LLVMReductionsBufferTy->getPointerTo());
2511
2512 // 1. Build a list of reduction variables.
2513 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2514 Address ReductionList =
2515 CGF.CreateMemTemp(T: ReductionArrayTy, Name: ".omp.reduction.red_list");
2516 auto IPriv = Privates.begin();
2517 llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2518 /*Volatile=*/false, C.IntTy,
2519 Loc)};
2520 unsigned Idx = 0;
2521 for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
2522 Address Elem = CGF.Builder.CreateConstArrayGEP(Addr: ReductionList, Index: Idx);
2523 // Global = Buffer.VD[Idx];
2524 const ValueDecl *VD = cast<DeclRefExpr>(Val: *IPriv)->getDecl();
2525 const FieldDecl *FD = VarFieldMap.lookup(Val: VD);
2526 llvm::Value *BufferPtr =
2527 Bld.CreateInBoundsGEP(Ty: LLVMReductionsBufferTy, Ptr: BufferArrPtr, IdxList: Idxs);
2528 LValue GlobLVal = CGF.EmitLValueForField(
2529 Base: CGF.MakeNaturalAlignAddrLValue(V: BufferPtr, T: StaticTy), Field: FD);
2530 Address GlobAddr = GlobLVal.getAddress(CGF);
2531 CGF.EmitStoreOfScalar(GlobAddr.getPointer(), Elem, /*Volatile=*/false,
2532 C.VoidPtrTy);
2533 if ((*IPriv)->getType()->isVariablyModifiedType()) {
2534 // Store array size.
2535 ++Idx;
2536 Elem = CGF.Builder.CreateConstArrayGEP(Addr: ReductionList, Index: Idx);
2537 llvm::Value *Size = CGF.Builder.CreateIntCast(
2538 V: CGF.getVLASize(
2539 vla: CGF.getContext().getAsVariableArrayType(T: (*IPriv)->getType()))
2540 .NumElts,
2541 DestTy: CGF.SizeTy, /*isSigned=*/false);
2542 CGF.Builder.CreateStore(Val: CGF.Builder.CreateIntToPtr(V: Size, DestTy: CGF.VoidPtrTy),
2543 Addr: Elem);
2544 }
2545 }
2546
2547 // Call reduce_function(ReduceList, GlobalReduceList)
2548 llvm::Value *GlobalReduceList = ReductionList.getPointer();
2549 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2550 llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
2551 AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
2552 CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2553 CGF, Loc, OutlinedFn: ReduceFn, Args: {ReducedPtr, GlobalReduceList});
2554 CGF.FinishFunction();
2555 return Fn;
2556}
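// Taken together, the four list_to_global/global_to_list helpers above are
// passed to the teams-reduction entry point of the runtime. In pseudo-code the
// intended division of labour is roughly (a sketch, not the runtime's actual
// implementation):
//   if (I am the team master)
//     list_to_global_{copy,reduce}_func(buffer, my_slot, reduce_data);
//   if (I am in the last team to finish)
//     for each used slot S:
//       global_to_list_{copy,reduce}_func(buffer, S, reduce_data);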
2557
2558///
2559/// Design of OpenMP reductions on the GPU
2560///
2561/// Consider a typical OpenMP program with one or more reduction
2562/// clauses:
2563///
2564/// float foo;
2565/// double bar;
2566/// #pragma omp target teams distribute parallel for \
2567/// reduction(+:foo) reduction(*:bar)
2568/// for (int i = 0; i < N; i++) {
2569/// foo += A[i]; bar *= B[i];
2570/// }
2571///
2572/// where 'foo' and 'bar' are reduced across all OpenMP threads in
2573/// all teams. In our OpenMP implementation on the NVPTX device an
2574/// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
2575/// within a team are mapped to CUDA threads within a threadblock.
2576/// Our goal is to efficiently aggregate values across all OpenMP
2577/// threads such that:
2578///
2579/// - the compiler and runtime are logically concise, and
2580/// - the reduction is performed efficiently in a hierarchical
2581/// manner as follows: within OpenMP threads in the same warp,
2582/// across warps in a threadblock, and finally across teams on
2583/// the NVPTX device.
2584///
2585/// Introduction to Decoupling
2586///
2587/// We would like to decouple the compiler and the runtime so that the
2588/// latter is ignorant of the reduction variables (number, data types)
2589/// and the reduction operators. This allows a simpler interface
2590/// and implementation while still attaining good performance.
2591///
2592/// Pseudocode for the aforementioned OpenMP program generated by the
2593/// compiler is as follows:
2594///
2595/// 1. Create private copies of reduction variables on each OpenMP
2596/// thread: 'foo_private', 'bar_private'
2597/// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
2598/// to it and writes the result in 'foo_private' and 'bar_private'
2599/// respectively.
2600/// 3. Call the OpenMP runtime on the GPU to reduce within a team
2601/// and store the result on the team master:
2602///
2603/// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
2604/// reduceData, shuffleReduceFn, interWarpCpyFn)
2605///
2606/// where:
2607/// struct ReduceData {
2608/// float *foo;
2609/// double *bar;
2610/// } reduceData
2611/// reduceData.foo = &foo_private
2612/// reduceData.bar = &bar_private
2613///
2614/// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
2615/// auxiliary functions generated by the compiler that operate on
2616/// variables of type 'ReduceData'. They help the runtime perform
2617/// algorithmic steps in a data-agnostic manner.
2618///
2619/// 'shuffleReduceFn' is a pointer to a function that reduces data
2620/// of type 'ReduceData' across two OpenMP threads (lanes) in the
2621/// same warp. It takes the following arguments as input:
2622///
2623/// a. variable of type 'ReduceData' on the calling lane,
2624/// b. its lane_id,
2625/// c. an offset relative to the current lane_id to generate a
2626/// remote_lane_id. The remote lane contains the second
2627/// variable of type 'ReduceData' that is to be reduced.
2628/// d. an algorithm version parameter determining which reduction
2629/// algorithm to use.
2630///
2631/// 'shuffleReduceFn' retrieves data from the remote lane using
2632/// efficient GPU shuffle intrinsics and reduces, using the
2633/// algorithm specified by the 4th parameter, the two operands
2634/// element-wise. The result is written to the first operand.
2635///
2636/// Different reduction algorithms are implemented in different
2637/// runtime functions, all calling 'shuffleReduceFn' to perform
2638/// the essential reduction step. Therefore, based on the 4th
2639/// parameter, this function behaves slightly differently to
2640/// cooperate with the runtime to ensure correctness under
2641/// different circumstances.
2642///
2643/// 'InterWarpCpyFn' is a pointer to a function that transfers
2644/// reduced variables across warps. It tunnels, through CUDA
2645/// shared memory, the thread-private data of type 'ReduceData'
2646/// from lane 0 of each warp to a lane in the first warp.
2647/// 4. Call the OpenMP runtime on the GPU to reduce across teams.
2648/// The last team writes the global reduced value to memory.
2649///
2650/// ret = __kmpc_nvptx_teams_reduce_nowait(...,
2651/// reduceData, shuffleReduceFn, interWarpCpyFn,
2652/// scratchpadCopyFn, loadAndReduceFn)
2653///
2654/// 'scratchpadCopyFn' is a helper that stores reduced
2655/// data from the team master to a scratchpad array in
2656/// global memory.
2657///
2658/// 'loadAndReduceFn' is a helper that loads data from
2659/// the scratchpad array and reduces it with the input
2660/// operand.
2661///
2662/// These compiler generated functions hide address
2663/// calculation and alignment information from the runtime.
2664/// 5. if ret == 1:
2665/// The team master of the last team stores the reduced
2666/// result to the globals in memory.
2667/// foo += reduceData.foo; bar *= reduceData.bar
2668///
2669///
2670/// Warp Reduction Algorithms
2671///
2672/// On the warp level, we have three algorithms implemented in the
2673/// OpenMP runtime depending on the number of active lanes:
2674///
2675/// Full Warp Reduction
2676///
2677/// The reduce algorithm within a warp where all lanes are active
2678/// is implemented in the runtime as follows:
2679///
2680/// full_warp_reduce(void *reduce_data,
2681/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
2682/// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
2683/// ShuffleReduceFn(reduce_data, 0, offset, 0);
2684/// }
2685///
2686/// The algorithm completes in log(2, WARPSIZE) steps.
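/// For example, with WARPSIZE == 32 the loop issues offsets 16, 8, 4, 2 and 1;
/// after these five steps lane 0 holds the value reduced across all 32 lanes.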
2687///
2688/// 'ShuffleReduceFn' is called here with lane_id set to 0 because the lane_id
2689/// is not used; this saves the instructions needed to retrieve lane_id
2690/// from the corresponding special registers. The 4th parameter, which
2691/// represents the version of the algorithm being used, is set to 0 to
2692/// signify full warp reduction.
2693///
2694/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2695///
2696/// #reduce_elem refers to an element in the local lane's data structure
2697/// #remote_elem is retrieved from a remote lane
2698/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2699/// reduce_elem = reduce_elem REDUCE_OP remote_elem;
2700///
2701/// Contiguous Partial Warp Reduction
2702///
2703/// This reduce algorithm is used within a warp where only the first
2704/// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
2705/// number of OpenMP threads in a parallel region is not a multiple of
2706/// WARPSIZE. The algorithm is implemented in the runtime as follows:
2707///
2708/// void
2709/// contiguous_partial_reduce(void *reduce_data,
2710/// kmp_ShuffleReductFctPtr ShuffleReduceFn,
2711/// int size, int lane_id) {
2712/// int curr_size;
2713/// int offset;
2714/// curr_size = size;
2715/// offset = curr_size/2;
2716/// while (offset>0) {
2717/// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
2718/// curr_size = (curr_size+1)/2;
2719/// offset = curr_size/2;
2720/// }
2721/// }
2722///
2723/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2724///
2725/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2726/// if (lane_id < offset)
2727/// reduce_elem = reduce_elem REDUCE_OP remote_elem
2728/// else
2729/// reduce_elem = remote_elem
2730///
2731/// This algorithm assumes that the data to be reduced are located in a
2732/// contiguous subset of lanes starting from the first. When there is
2733/// an odd number of active lanes, the data in the last lane is not
2734/// aggregated with any other lane's data but is instead copied over.
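///
/// For example, with n == 5 active lanes holding a, b, c, d, e and
/// REDUCE_OP == +, the offsets are 2, 1, 1:
///   offset 2: lane 0 = a+c, lane 1 = b+d, lane 2 copies e
///   offset 1: lane 0 = a+b+c+d, lane 1 copies e
///   offset 1: lane 0 = a+b+c+d+e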
2735///
2736/// Dispersed Partial Warp Reduction
2737///
2738/// This algorithm is used within a warp when any discontiguous subset of
2739/// lanes is active. It is used to implement the reduction operation
2740/// across lanes in an OpenMP simd region or in a nested parallel region.
2741///
2742/// void
2743/// dispersed_partial_reduce(void *reduce_data,
2744/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
2745/// int size, remote_id;
2746/// int logical_lane_id = number_of_active_lanes_before_me() * 2;
2747/// do {
2748/// remote_id = next_active_lane_id_right_after_me();
2749///       # the above function returns 0 if no active lane
2750/// # is present right after the current lane.
2751/// size = number_of_active_lanes_in_this_warp();
2752/// logical_lane_id /= 2;
2753/// ShuffleReduceFn(reduce_data, logical_lane_id,
2754/// remote_id-1-threadIdx.x, 2);
2755/// } while (logical_lane_id % 2 == 0 && size > 1);
2756/// }
2757///
2758/// There is no assumption made about the initial state of the reduction.
2759/// Any number of lanes (>=1) could be active at any position. The reduction
2760/// result is returned in the first active lane.
2761///
2762/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2763///
2764/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2765/// if (lane_id % 2 == 0 && offset > 0)
2766/// reduce_elem = reduce_elem REDUCE_OP remote_elem
2767/// else
2768/// reduce_elem = remote_elem
2769///
2770///
2771/// Intra-Team Reduction
2772///
2773/// This function, as implemented in the runtime call
2774/// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
2775/// threads in a team. It first reduces within a warp using the
2776/// aforementioned algorithms. We then proceed to gather all such
2777/// reduced values at the first warp.
2778///
2779/// The runtime makes use of the function 'InterWarpCpyFn', which copies
2780/// data from each of the "warp masters" (zeroth lane of each warp, where
2781/// warp-reduced data is held) to the zeroth warp. This step reduces (in
2782/// a mathematical sense) the problem of reduction across warp masters in
2783/// a block to the problem of warp reduction.
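///
/// For example, with 1024 OpenMP threads in a team (32 warps of 32 lanes),
/// the intra-warp step leaves one partial value per warp; 'InterWarpCpyFn'
/// copies these 32 values to the first warp, where a second warp-level
/// reduction produces the team's result.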
2784///
2785///
2786/// Inter-Team Reduction
2787///
2788/// Once a team has reduced its data to a single value, it is stored in
2789/// a global scratchpad array. Since each team has a distinct slot, this
2790/// can be done without locking.
2791///
2792/// The last team to write to the scratchpad array proceeds to reduce the
2793/// scratchpad array. One or more workers in the last team use the helper
2794/// 'loadAndReduceFn' to load and reduce values from the array, i.e.,
2795/// the k'th worker reduces every k'th element.
2796///
2797/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
2798/// reduce across workers and compute a globally reduced value.
2799///
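/// As a rough, illustrative sketch (the variable names below are not the
/// generated ones), the code emitted for a parallel reduction has the shape:
///
///   void *RedList[] = {&reduce_data.foo, &reduce_data.bar};
///   int32_t ret = __kmpc_nvptx_parallel_reduce_nowait_v2(
///       loc, reduce_data_size, RedList, shuffleReduceFn, interWarpCpyFn);
///   if (ret == 1) {
///     foo += reduce_data.foo; bar *= reduce_data.bar;
///   }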
2800void CGOpenMPRuntimeGPU::emitReduction(
2801 CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
2802 ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
2803 ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
2804 if (!CGF.HaveInsertPoint())
2805 return;
2806
2807 bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
2808#ifndef NDEBUG
2809 bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
2810#endif
2811
2812 if (Options.SimpleReduction) {
2813 assert(!TeamsReduction && !ParallelReduction &&
2814 "Invalid reduction selection in emitReduction.");
2815 CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
2816 ReductionOps, Options);
2817 return;
2818 }
2819
2820 assert((TeamsReduction || ParallelReduction) &&
2821 "Invalid reduction selection in emitReduction.");
2822
2823 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
2824 llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
2825 int Cnt = 0;
2826 for (const Expr *DRE : Privates) {
2827 PrivatesReductions[Cnt] = cast<DeclRefExpr>(Val: DRE)->getDecl();
2828 ++Cnt;
2829 }
2830
2831 ASTContext &C = CGM.getContext();
2832 const RecordDecl *ReductionRec = ::buildRecordForGlobalizedVars(
2833 C&: CGM.getContext(), EscapedDecls: PrivatesReductions, EscapedDeclsForTeams: std::nullopt, MappedDeclsFields&: VarFieldMap, BufSize: 1);
2834
2835 // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
2836 // RedList, shuffle_reduce_func, interwarp_copy_func);
2837 // or
2838 // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
2839 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2840
2841 llvm::Value *Res;
2842 // 1. Build a list of reduction variables.
2843 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2844 auto Size = RHSExprs.size();
2845 for (const Expr *E : Privates) {
2846 if (E->getType()->isVariablyModifiedType())
2847 // Reserve place for array size.
2848 ++Size;
2849 }
2850 llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
2851 QualType ReductionArrayTy = C.getConstantArrayType(
2852 EltTy: C.VoidPtrTy, ArySize: ArraySize, SizeExpr: nullptr, ASM: ArraySizeModifier::Normal,
2853 /*IndexTypeQuals=*/0);
2854 Address ReductionList =
2855 CGF.CreateMemTemp(T: ReductionArrayTy, Name: ".omp.reduction.red_list");
2856 auto IPriv = Privates.begin();
2857 unsigned Idx = 0;
2858 for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
2859 Address Elem = CGF.Builder.CreateConstArrayGEP(Addr: ReductionList, Index: Idx);
2860 CGF.Builder.CreateStore(
2861 Val: CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2862 V: CGF.EmitLValue(E: RHSExprs[I]).getPointer(CGF), DestTy: CGF.VoidPtrTy),
2863 Addr: Elem);
2864 if ((*IPriv)->getType()->isVariablyModifiedType()) {
2865 // Store array size.
2866 ++Idx;
2867 Elem = CGF.Builder.CreateConstArrayGEP(Addr: ReductionList, Index: Idx);
2868 llvm::Value *Size = CGF.Builder.CreateIntCast(
2869 V: CGF.getVLASize(
2870 vla: CGF.getContext().getAsVariableArrayType(T: (*IPriv)->getType()))
2871 .NumElts,
2872 DestTy: CGF.SizeTy, /*isSigned=*/false);
2873 CGF.Builder.CreateStore(Val: CGF.Builder.CreateIntToPtr(V: Size, DestTy: CGF.VoidPtrTy),
2874 Addr: Elem);
2875 }
2876 }
2877
2878 llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2879 V: ReductionList.getPointer(), DestTy: CGF.VoidPtrTy);
2880 llvm::Function *ReductionFn = emitReductionFunction(
2881 CGF.CurFn->getName(), Loc, CGF.ConvertTypeForMem(T: ReductionArrayTy),
2882 Privates, LHSExprs, RHSExprs, ReductionOps);
2883 llvm::Value *ReductionDataSize =
2884 CGF.getTypeSize(Ty: C.getRecordType(Decl: ReductionRec));
2885 ReductionDataSize =
2886 CGF.Builder.CreateSExtOrTrunc(V: ReductionDataSize, DestTy: CGF.Int64Ty);
2887 llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
2888 CGM, Privates, ReductionArrayTy, ReduceFn: ReductionFn, Loc);
2889 llvm::Value *InterWarpCopyFn =
2890 emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
2891
2892 if (ParallelReduction) {
2893 llvm::Value *Args[] = {RTLoc, ReductionDataSize, RL, ShuffleAndReduceFn,
2894 InterWarpCopyFn};
2895
2896 Res = CGF.EmitRuntimeCall(
2897 callee: OMPBuilder.getOrCreateRuntimeFunction(
2898 M&: CGM.getModule(), FnID: OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2),
2899 args: Args);
2900 } else {
2901 assert(TeamsReduction && "expected teams reduction.");
2902 TeamsReductions.push_back(Elt: ReductionRec);
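    // The teams path additionally needs the runtime's fixed reduction buffer
    // and four compiler-generated helpers that copy/reduce a thread's
    // reduction list to and from a slot of that buffer.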
2903 auto *KernelTeamsReductionPtr = CGF.EmitRuntimeCall(
2904 callee: OMPBuilder.getOrCreateRuntimeFunction(
2905 M&: CGM.getModule(), FnID: OMPRTL___kmpc_reduction_get_fixed_buffer),
2906 args: {}, name: "_openmp_teams_reductions_buffer_$_$ptr");
2907 llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
2908 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec: ReductionRec, VarFieldMap);
2909 llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
2910 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec: ReductionRec, VarFieldMap,
2911 ReduceFn: ReductionFn);
2912 llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction(
2913 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec: ReductionRec, VarFieldMap);
2914 llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction(
2915 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec: ReductionRec, VarFieldMap,
2916 ReduceFn: ReductionFn);
2917
2918 llvm::Value *Args[] = {
2919 RTLoc,
2920 KernelTeamsReductionPtr,
2921 CGF.Builder.getInt32(C: C.getLangOpts().OpenMPCUDAReductionBufNum),
2922 ReductionDataSize,
2923 RL,
2924 ShuffleAndReduceFn,
2925 InterWarpCopyFn,
2926 GlobalToBufferCpyFn,
2927 GlobalToBufferRedFn,
2928 BufferToGlobalCpyFn,
2929 BufferToGlobalRedFn};
2930
2931 Res = CGF.EmitRuntimeCall(
2932 callee: OMPBuilder.getOrCreateRuntimeFunction(
2933 M&: CGM.getModule(), FnID: OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2),
2934 args: Args);
2935 }
2936
2937 // 5. Build if (res == 1)
2938 llvm::BasicBlock *ExitBB = CGF.createBasicBlock(name: ".omp.reduction.done");
2939 llvm::BasicBlock *ThenBB = CGF.createBasicBlock(name: ".omp.reduction.then");
2940 llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
2941 LHS: Res, RHS: llvm::ConstantInt::get(Ty: CGM.Int32Ty, /*V=*/1));
2942 CGF.Builder.CreateCondBr(Cond, True: ThenBB, False: ExitBB);
2943
2944 // 6. Build then branch: where we have reduced values in the master
2945 // thread in each team.
2946 // __kmpc_end_reduce{_nowait}(<gtid>);
2947 // break;
2948 CGF.EmitBlock(BB: ThenBB);
2949
2950 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
2951 auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
2952 this](CodeGenFunction &CGF, PrePostActionTy &Action) {
2953 auto IPriv = Privates.begin();
2954 auto ILHS = LHSExprs.begin();
2955 auto IRHS = RHSExprs.begin();
2956 for (const Expr *E : ReductionOps) {
2957 emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(Val: *ILHS),
2958 cast<DeclRefExpr>(Val: *IRHS));
2959 ++IPriv;
2960 ++ILHS;
2961 ++IRHS;
2962 }
2963 };
2964 RegionCodeGenTy RCG(CodeGen);
2965 RCG(CGF);
2966  // There is no need to emit a line number for the unconditional branch.
2967 (void)ApplyDebugLocation::CreateEmpty(CGF);
2968 CGF.EmitBlock(BB: ExitBB, /*IsFinished=*/true);
2969}
2970
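/// Rewrite a reference-typed native parameter into the form the outlined
/// device function expects: a restrict-qualified pointer in the NVPTX local
/// address space (the pointee is placed in the global address space for
/// 'map' captures). Parameters that are not references are returned
/// unchanged.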
2971const VarDecl *
2972CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD,
2973 const VarDecl *NativeParam) const {
2974 if (!NativeParam->getType()->isReferenceType())
2975 return NativeParam;
2976 QualType ArgType = NativeParam->getType();
2977 QualifierCollector QC;
2978 const Type *NonQualTy = QC.strip(type: ArgType);
2979 QualType PointeeTy = cast<ReferenceType>(Val: NonQualTy)->getPointeeType();
2980 if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
2981 if (Attr->getCaptureKind() == OMPC_map) {
2982 PointeeTy = CGM.getContext().getAddrSpaceQualType(T: PointeeTy,
2983 AddressSpace: LangAS::opencl_global);
2984 }
2985 }
2986 ArgType = CGM.getContext().getPointerType(T: PointeeTy);
2987 QC.addRestrict();
2988 enum { NVPTX_local_addr = 5 };
2989 QC.addAddressSpace(space: getLangASFromTargetAS(TargetAS: NVPTX_local_addr));
2990 ArgType = QC.apply(Context: CGM.getContext(), QT: ArgType);
2991 if (isa<ImplicitParamDecl>(Val: NativeParam))
2992 return ImplicitParamDecl::Create(
2993 CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
2994 NativeParam->getIdentifier(), ArgType, ImplicitParamKind::Other);
2995 return ParmVarDecl::Create(
2996 C&: CGM.getContext(),
2997 DC: const_cast<DeclContext *>(NativeParam->getDeclContext()),
2998 StartLoc: NativeParam->getBeginLoc(), IdLoc: NativeParam->getLocation(),
2999 Id: NativeParam->getIdentifier(), T: ArgType,
3000 /*TInfo=*/nullptr, S: SC_None, /*DefArg=*/nullptr);
3001}
3002
3003Address
3004CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF,
3005 const VarDecl *NativeParam,
3006 const VarDecl *TargetParam) const {
3007 assert(NativeParam != TargetParam &&
3008 NativeParam->getType()->isReferenceType() &&
3009 "Native arg must not be the same as target arg.");
3010 Address LocalAddr = CGF.GetAddrOfLocalVar(VD: TargetParam);
3011 QualType NativeParamType = NativeParam->getType();
3012 QualifierCollector QC;
3013 const Type *NonQualTy = QC.strip(type: NativeParamType);
3014 QualType NativePointeeTy = cast<ReferenceType>(Val: NonQualTy)->getPointeeType();
3015 unsigned NativePointeeAddrSpace =
3016 CGF.getTypes().getTargetAddressSpace(T: NativePointeeTy);
3017 QualType TargetTy = TargetParam->getType();
3018 llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(Addr: LocalAddr, /*Volatile=*/false,
3019 Ty: TargetTy, Loc: SourceLocation());
3020 // Cast to native address space.
3021 TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3022 V: TargetAddr,
3023 DestTy: llvm::PointerType::get(C&: CGF.getLLVMContext(), AddressSpace: NativePointeeAddrSpace));
3024 Address NativeParamAddr = CGF.CreateMemTemp(T: NativeParamType);
3025 CGF.EmitStoreOfScalar(Value: TargetAddr, Addr: NativeParamAddr, /*Volatile=*/false,
3026 Ty: NativeParamType);
3027 return NativeParamAddr;
3028}
3029
3030void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall(
3031 CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
3032 ArrayRef<llvm::Value *> Args) const {
3033 SmallVector<llvm::Value *, 4> TargetArgs;
3034 TargetArgs.reserve(N: Args.size());
3035 auto *FnType = OutlinedFn.getFunctionType();
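  // Cast pointer arguments to the address space the outlined function's
  // parameters expect; once the variadic tail is reached, forward the
  // remaining arguments unchanged.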
3036 for (unsigned I = 0, E = Args.size(); I < E; ++I) {
3037 if (FnType->isVarArg() && FnType->getNumParams() <= I) {
3038 TargetArgs.append(in_start: std::next(x: Args.begin(), n: I), in_end: Args.end());
3039 break;
3040 }
3041 llvm::Type *TargetType = FnType->getParamType(i: I);
3042 llvm::Value *NativeArg = Args[I];
3043 if (!TargetType->isPointerTy()) {
3044 TargetArgs.emplace_back(Args&: NativeArg);
3045 continue;
3046 }
3047 TargetArgs.emplace_back(
3048 Args: CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(V: NativeArg, DestTy: TargetType));
3049 }
3050 CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
3051}
3052
3053/// Emit function which wraps the outline parallel region
3054/// and controls the arguments which are passed to this function.
3055/// The wrapper ensures that the outlined function is called
3056/// with the correct arguments when data is shared.
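///
/// Conceptually (the names below are illustrative, not the generated ones),
/// the wrapper has the shape:
///
///   void <outlined>_wrapper(uint16_t ParallelLevel, uint32_t ThreadID) {
///     uint32_t Zero = 0;
///     void **GlobalArgs;
///     __kmpc_get_shared_variables(&GlobalArgs);
///     // Loop bounds (when shared) and captured variables are read from
///     // GlobalArgs and forwarded here.
///     <outlined>(&ThreadID, &Zero, /* values loaded from GlobalArgs */);
///   }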
3057llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
3058 llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
3059 ASTContext &Ctx = CGM.getContext();
3060 const auto &CS = *D.getCapturedStmt(OMPD_parallel);
3061
3062 // Create a function that takes as argument the source thread.
3063 FunctionArgList WrapperArgs;
3064 QualType Int16QTy =
3065 Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
3066 QualType Int32QTy =
3067 Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
3068 ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
3069 /*Id=*/nullptr, Int16QTy,
3070 ImplicitParamKind::Other);
3071 ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
3072 /*Id=*/nullptr, Int32QTy,
3073 ImplicitParamKind::Other);
3074 WrapperArgs.emplace_back(Args: &ParallelLevelArg);
3075 WrapperArgs.emplace_back(Args: &WrapperArg);
3076
3077 const CGFunctionInfo &CGFI =
3078 CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
3079
3080 auto *Fn = llvm::Function::Create(
3081 Ty: CGM.getTypes().GetFunctionType(Info: CGFI), Linkage: llvm::GlobalValue::InternalLinkage,
3082 N: Twine(OutlinedParallelFn->getName(), "_wrapper"), M: &CGM.getModule());
3083
3084 // Ensure we do not inline the function. This is trivially true for the ones
3085  // passed to __kmpc_fork_call but the ones called in serialized regions
3086  // could be inlined. This is not perfect, but it is closer to the invariant
3087 // we want, namely, every data environment starts with a new function.
3088 // TODO: We should pass the if condition to the runtime function and do the
3089 // handling there. Much cleaner code.
3090 Fn->addFnAttr(llvm::Attribute::NoInline);
3091
3092 CGM.SetInternalFunctionAttributes(GD: GlobalDecl(), F: Fn, FI: CGFI);
3093 Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
3094 Fn->setDoesNotRecurse();
3095
3096 CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
3097 CGF.StartFunction(GD: GlobalDecl(), RetTy: Ctx.VoidTy, Fn: Fn, FnInfo: CGFI, Args: WrapperArgs,
3098 Loc: D.getBeginLoc(), StartLoc: D.getBeginLoc());
3099
3100 const auto *RD = CS.getCapturedRecordDecl();
3101 auto CurField = RD->field_begin();
3102
3103 Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(Ty: CGF.Int32Ty,
3104 /*Name=*/".zero.addr");
3105 CGF.Builder.CreateStore(Val: CGF.Builder.getInt32(/*C*/ 0), Addr: ZeroAddr);
3106 // Get the array of arguments.
3107 SmallVector<llvm::Value *, 8> Args;
3108
3109 Args.emplace_back(Args: CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());
3110 Args.emplace_back(Args: ZeroAddr.getPointer());
3111
3112 CGBuilderTy &Bld = CGF.Builder;
3113 auto CI = CS.capture_begin();
3114
3115 // Use global memory for data sharing.
3116 // Handle passing of global args to workers.
3117 Address GlobalArgs =
3118 CGF.CreateDefaultAlignTempAlloca(Ty: CGF.VoidPtrPtrTy, Name: "global_args");
3119 llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
3120 llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
3121 CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
3122 M&: CGM.getModule(), FnID: OMPRTL___kmpc_get_shared_variables),
3123 args: DataSharingArgs);
3124
3125 // Retrieve the shared variables from the list of references returned
3126 // by the runtime. Pass the variables to the outlined function.
3127 Address SharedArgListAddress = Address::invalid();
3128 if (CS.capture_size() > 0 ||
3129 isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
3130 SharedArgListAddress = CGF.EmitLoadOfPointer(
3131 Ptr: GlobalArgs, PtrTy: CGF.getContext()
3132 .getPointerType(CGF.getContext().VoidPtrTy)
3133 .castAs<PointerType>());
3134 }
3135 unsigned Idx = 0;
3136 if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
3137 Address Src = Bld.CreateConstInBoundsGEP(Addr: SharedArgListAddress, Index: Idx);
3138 Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3139 Addr: Src, Ty: CGF.SizeTy->getPointerTo(), ElementTy: CGF.SizeTy);
3140 llvm::Value *LB = CGF.EmitLoadOfScalar(
3141 Addr: TypedAddress,
3142 /*Volatile=*/false,
3143 Ty: CGF.getContext().getPointerType(T: CGF.getContext().getSizeType()),
3144 Loc: cast<OMPLoopDirective>(Val: D).getLowerBoundVariable()->getExprLoc());
3145 Args.emplace_back(Args&: LB);
3146 ++Idx;
3147 Src = Bld.CreateConstInBoundsGEP(Addr: SharedArgListAddress, Index: Idx);
3148 TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3149 Addr: Src, Ty: CGF.SizeTy->getPointerTo(), ElementTy: CGF.SizeTy);
3150 llvm::Value *UB = CGF.EmitLoadOfScalar(
3151 Addr: TypedAddress,
3152 /*Volatile=*/false,
3153 Ty: CGF.getContext().getPointerType(T: CGF.getContext().getSizeType()),
3154 Loc: cast<OMPLoopDirective>(Val: D).getUpperBoundVariable()->getExprLoc());
3155 Args.emplace_back(Args&: UB);
3156 ++Idx;
3157 }
3158 if (CS.capture_size() > 0) {
3159 ASTContext &CGFContext = CGF.getContext();
3160 for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
3161 QualType ElemTy = CurField->getType();
3162 Address Src = Bld.CreateConstInBoundsGEP(Addr: SharedArgListAddress, Index: I + Idx);
3163 Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3164 Addr: Src, Ty: CGF.ConvertTypeForMem(T: CGFContext.getPointerType(T: ElemTy)),
3165 ElementTy: CGF.ConvertTypeForMem(T: ElemTy));
3166 llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
3167 /*Volatile=*/false,
3168 CGFContext.getPointerType(T: ElemTy),
3169 CI->getLocation());
3170 if (CI->capturesVariableByCopy() &&
3171 !CI->getCapturedVar()->getType()->isAnyPointerType()) {
3172 Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
3173 CI->getLocation());
3174 }
3175 Args.emplace_back(Args&: Arg);
3176 }
3177 }
3178
3179 emitOutlinedFunctionCall(CGF, Loc: D.getBeginLoc(), OutlinedFn: OutlinedParallelFn, Args);
3180 CGF.FinishFunction();
3181 return Fn;
3182}
3183
3184void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
3185 const Decl *D) {
3186 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
3187 return;
3188
3189 assert(D && "Expected function or captured|block decl.");
3190 assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
3191 "Function is registered already.");
3192 assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
3193 "Team is set but not processed.");
3194 const Stmt *Body = nullptr;
3195 bool NeedToDelayGlobalization = false;
3196 if (const auto *FD = dyn_cast<FunctionDecl>(Val: D)) {
3197 Body = FD->getBody();
3198 } else if (const auto *BD = dyn_cast<BlockDecl>(Val: D)) {
3199 Body = BD->getBody();
3200 } else if (const auto *CD = dyn_cast<CapturedDecl>(Val: D)) {
3201 Body = CD->getBody();
3202 NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
3203 if (NeedToDelayGlobalization &&
3204 getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
3205 return;
3206 }
3207 if (!Body)
3208 return;
3209 CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
3210 VarChecker.Visit(Body);
3211 const RecordDecl *GlobalizedVarsRecord =
3212 VarChecker.getGlobalizedRecord(IsInTTDRegion);
3213 TeamAndReductions.first = nullptr;
3214 TeamAndReductions.second.clear();
3215 ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
3216 VarChecker.getEscapedVariableLengthDecls();
3217 ArrayRef<const ValueDecl *> DelayedVariableLengthDecls =
3218 VarChecker.getDelayedVariableLengthDecls();
3219 if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty() &&
3220 DelayedVariableLengthDecls.empty())
3221 return;
3222 auto I = FunctionGlobalizedDecls.try_emplace(Key: CGF.CurFn).first;
3223 I->getSecond().MappedParams =
3224 std::make_unique<CodeGenFunction::OMPMapVars>();
3225 I->getSecond().EscapedParameters.insert(
3226 I: VarChecker.getEscapedParameters().begin(),
3227 E: VarChecker.getEscapedParameters().end());
3228 I->getSecond().EscapedVariableLengthDecls.append(
3229 in_start: EscapedVariableLengthDecls.begin(), in_end: EscapedVariableLengthDecls.end());
3230 I->getSecond().DelayedVariableLengthDecls.append(
3231 in_start: DelayedVariableLengthDecls.begin(), in_end: DelayedVariableLengthDecls.end());
3232 DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
3233 for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
3234 assert(VD->isCanonicalDecl() && "Expected canonical declaration");
3235 Data.insert(std::make_pair(x&: VD, y: MappedVarData()));
3236 }
3237 if (!NeedToDelayGlobalization) {
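    // Globalize the escaped variables now and push a cleanup so the
    // globalized storage is released on function exit (normal and EH paths).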
3238 emitGenericVarsProlog(CGF, Loc: D->getBeginLoc());
3239 struct GlobalizationScope final : EHScopeStack::Cleanup {
3240 GlobalizationScope() = default;
3241
3242 void Emit(CodeGenFunction &CGF, Flags flags) override {
3243 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
3244 .emitGenericVarsEpilog(CGF);
3245 }
3246 };
3247 CGF.EHStack.pushCleanup<GlobalizationScope>(Kind: NormalAndEHCleanup);
3248 }
3249}
3250
3251Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
3252 const VarDecl *VD) {
3253 if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
3254 const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
3255 auto AS = LangAS::Default;
3256 switch (A->getAllocatorType()) {
3257 // Use the default allocator here as by default local vars are
3258 // threadlocal.
3259 case OMPAllocateDeclAttr::OMPNullMemAlloc:
3260 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
3261 case OMPAllocateDeclAttr::OMPThreadMemAlloc:
3262 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
3263 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
3264 // Follow the user decision - use default allocation.
3265 return Address::invalid();
3266 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
3267      // TODO: implement support for user-defined allocators.
3268 return Address::invalid();
3269 case OMPAllocateDeclAttr::OMPConstMemAlloc:
3270 AS = LangAS::cuda_constant;
3271 break;
3272 case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
3273 AS = LangAS::cuda_shared;
3274 break;
3275 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
3276 case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
3277 break;
3278 }
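    // Emit the variable as a module-level global in the selected GPU memory
    // space and return its address cast to the address space expected for
    // the variable's declared type.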
3279 llvm::Type *VarTy = CGF.ConvertTypeForMem(T: VD->getType());
3280 auto *GV = new llvm::GlobalVariable(
3281 CGM.getModule(), VarTy, /*isConstant=*/false,
3282 llvm::GlobalValue::InternalLinkage, llvm::PoisonValue::get(T: VarTy),
3283 VD->getName(),
3284 /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
3285 CGM.getContext().getTargetAddressSpace(AS));
3286 CharUnits Align = CGM.getContext().getDeclAlign(VD);
3287 GV->setAlignment(Align.getAsAlign());
3288 return Address(
3289 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3290 GV, VarTy->getPointerTo(AddrSpace: CGM.getContext().getTargetAddressSpace(
3291 AS: VD->getType().getAddressSpace()))),
3292 VarTy, Align);
3293 }
3294
3295 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
3296 return Address::invalid();
3297
3298 VD = VD->getCanonicalDecl();
3299 auto I = FunctionGlobalizedDecls.find(Val: CGF.CurFn);
3300 if (I == FunctionGlobalizedDecls.end())
3301 return Address::invalid();
3302 auto VDI = I->getSecond().LocalVarData.find(VD);
3303 if (VDI != I->getSecond().LocalVarData.end())
3304 return VDI->second.PrivateAddr;
3305 if (VD->hasAttrs()) {
3306 for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
3307 E(VD->attr_end());
3308 IT != E; ++IT) {
3309 auto VDI = I->getSecond().LocalVarData.find(
3310 cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
3311 ->getCanonicalDecl());
3312 if (VDI != I->getSecond().LocalVarData.end())
3313 return VDI->second.PrivateAddr;
3314 }
3315 }
3316
3317 return Address::invalid();
3318}
3319
3320void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {
3321 FunctionGlobalizedDecls.erase(Val: CGF.CurFn);
3322 CGOpenMPRuntime::functionFinished(CGF);
3323}
3324
3325void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
3326 CodeGenFunction &CGF, const OMPLoopDirective &S,
3327 OpenMPDistScheduleClauseKind &ScheduleKind,
3328 llvm::Value *&Chunk) const {
3329 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
3330 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
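    // In SPMD mode, default to dist_schedule(static, <GPU thread count>) so
    // that each team gets contiguous chunks sized to its thread block.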
3331 ScheduleKind = OMPC_DIST_SCHEDULE_static;
3332 Chunk = CGF.EmitScalarConversion(
3333 Src: RT.getGPUNumThreads(CGF),
3334 SrcTy: CGF.getContext().getIntTypeForBitwidth(DestWidth: 32, /*Signed=*/0),
3335 DstTy: S.getIterationVariable()->getType(), Loc: S.getBeginLoc());
3336 return;
3337 }
3338 CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
3339 CGF, S, ScheduleKind, Chunk);
3340}
3341
3342void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk(
3343 CodeGenFunction &CGF, const OMPLoopDirective &S,
3344 OpenMPScheduleClauseKind &ScheduleKind,
3345 const Expr *&ChunkExpr) const {
3346 ScheduleKind = OMPC_SCHEDULE_static;
3347 // Chunk size is 1 in this case.
3348 llvm::APInt ChunkSize(32, 1);
3349 ChunkExpr = IntegerLiteral::Create(C: CGF.getContext(), V: ChunkSize,
3350 type: CGF.getContext().getIntTypeForBitwidth(DestWidth: 32, /*Signed=*/0),
3351 l: SourceLocation());
3352}
3353
3354void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas(
3355 CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
3356 assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
3357         "Expected target-based directive.");
3358 const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
3359 for (const CapturedStmt::Capture &C : CS->captures()) {
3360 // Capture variables captured by reference in lambdas for target-based
3361 // directives.
3362 if (!C.capturesVariable())
3363 continue;
3364 const VarDecl *VD = C.getCapturedVar();
3365 const auto *RD = VD->getType()
3366 .getCanonicalType()
3367 .getNonReferenceType()
3368 ->getAsCXXRecordDecl();
3369 if (!RD || !RD->isLambda())
3370 continue;
3371 Address VDAddr = CGF.GetAddrOfLocalVar(VD);
3372 LValue VDLVal;
3373 if (VD->getType().getCanonicalType()->isReferenceType())
3374 VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
3375 else
3376 VDLVal = CGF.MakeAddrLValue(
3377 VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
3378 llvm::DenseMap<const ValueDecl *, FieldDecl *> Captures;
3379 FieldDecl *ThisCapture = nullptr;
3380 RD->getCaptureFields(Captures, ThisCapture);
3381 if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
3382 LValue ThisLVal =
3383 CGF.EmitLValueForFieldInitialization(Base: VDLVal, Field: ThisCapture);
3384 llvm::Value *CXXThis = CGF.LoadCXXThis();
3385 CGF.EmitStoreOfScalar(value: CXXThis, lvalue: ThisLVal);
3386 }
3387 for (const LambdaCapture &LC : RD->captures()) {
3388 if (LC.getCaptureKind() != LCK_ByRef)
3389 continue;
3390 const ValueDecl *VD = LC.getCapturedVar();
3391 // FIXME: For now VD is always a VarDecl because OpenMP does not support
3392 // capturing structured bindings in lambdas yet.
3393 if (!CS->capturesVariable(cast<VarDecl>(VD)))
3394 continue;
3395 auto It = Captures.find(VD);
3396 assert(It != Captures.end() && "Found lambda capture without field.");
3397 LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
3398 Address VDAddr = CGF.GetAddrOfLocalVar(cast<VarDecl>(VD));
3399 if (VD->getType().getCanonicalType()->isReferenceType())
3400 VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
3401 VD->getType().getCanonicalType())
3402 .getAddress(CGF);
3403 CGF.EmitStoreOfScalar(VDAddr.getPointer(), VarLVal);
3404 }
3405 }
3406}
3407
3408bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
3409 LangAS &AS) {
3410 if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
3411 return false;
3412 const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
3413  switch (A->getAllocatorType()) {
3414 case OMPAllocateDeclAttr::OMPNullMemAlloc:
3415 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
3416 // Not supported, fallback to the default mem space.
3417 case OMPAllocateDeclAttr::OMPThreadMemAlloc:
3418 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
3419 case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
3420 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
3421 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
3422 AS = LangAS::Default;
3423 return true;
3424 case OMPAllocateDeclAttr::OMPConstMemAlloc:
3425 AS = LangAS::cuda_constant;
3426 return true;
3427 case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
3428 AS = LangAS::cuda_shared;
3429 return true;
3430 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
3431 llvm_unreachable("Expected predefined allocator for the variables with the "
3432 "static storage.");
3433 }
3434 return false;
3435}
3436
3437// Get the current CudaArch and ignore any unknown values.
3438static CudaArch getCudaArch(CodeGenModule &CGM) {
3439 if (!CGM.getTarget().hasFeature(Feature: "ptx"))
3440 return CudaArch::UNKNOWN;
3441 for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) {
3442 if (Feature.getValue()) {
3443 CudaArch Arch = StringToCudaArch(S: Feature.getKey());
3444 if (Arch != CudaArch::UNKNOWN)
3445 return Arch;
3446 }
3447 }
3448 return CudaArch::UNKNOWN;
3449}
3450
3451/// Check whether the target architecture supports unified addressing, which
3452/// is required by the OpenMP 'requires' clause "unified_shared_memory".
3453void CGOpenMPRuntimeGPU::processRequiresDirective(
3454 const OMPRequiresDecl *D) {
3455 for (const OMPClause *Clause : D->clauselists()) {
3456 if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
3457 CudaArch Arch = getCudaArch(CGM);
3458 switch (Arch) {
3459 case CudaArch::SM_20:
3460 case CudaArch::SM_21:
3461 case CudaArch::SM_30:
3462 case CudaArch::SM_32:
3463 case CudaArch::SM_35:
3464 case CudaArch::SM_37:
3465 case CudaArch::SM_50:
3466 case CudaArch::SM_52:
3467 case CudaArch::SM_53: {
3468 SmallString<256> Buffer;
3469 llvm::raw_svector_ostream Out(Buffer);
3470 Out << "Target architecture " << CudaArchToString(A: Arch)
3471 << " does not support unified addressing";
3472 CGM.Error(loc: Clause->getBeginLoc(), error: Out.str());
3473 return;
3474 }
3475 case CudaArch::SM_60:
3476 case CudaArch::SM_61:
3477 case CudaArch::SM_62:
3478 case CudaArch::SM_70:
3479 case CudaArch::SM_72:
3480 case CudaArch::SM_75:
3481 case CudaArch::SM_80:
3482 case CudaArch::SM_86:
3483 case CudaArch::SM_87:
3484 case CudaArch::SM_89:
3485 case CudaArch::SM_90:
3486 case CudaArch::SM_90a:
3487 case CudaArch::GFX600:
3488 case CudaArch::GFX601:
3489 case CudaArch::GFX602:
3490 case CudaArch::GFX700:
3491 case CudaArch::GFX701:
3492 case CudaArch::GFX702:
3493 case CudaArch::GFX703:
3494 case CudaArch::GFX704:
3495 case CudaArch::GFX705:
3496 case CudaArch::GFX801:
3497 case CudaArch::GFX802:
3498 case CudaArch::GFX803:
3499 case CudaArch::GFX805:
3500 case CudaArch::GFX810:
3501 case CudaArch::GFX900:
3502 case CudaArch::GFX902:
3503 case CudaArch::GFX904:
3504 case CudaArch::GFX906:
3505 case CudaArch::GFX908:
3506 case CudaArch::GFX909:
3507 case CudaArch::GFX90a:
3508 case CudaArch::GFX90c:
3509 case CudaArch::GFX940:
3510 case CudaArch::GFX941:
3511 case CudaArch::GFX942:
3512 case CudaArch::GFX1010:
3513 case CudaArch::GFX1011:
3514 case CudaArch::GFX1012:
3515 case CudaArch::GFX1013:
3516 case CudaArch::GFX1030:
3517 case CudaArch::GFX1031:
3518 case CudaArch::GFX1032:
3519 case CudaArch::GFX1033:
3520 case CudaArch::GFX1034:
3521 case CudaArch::GFX1035:
3522 case CudaArch::GFX1036:
3523 case CudaArch::GFX1100:
3524 case CudaArch::GFX1101:
3525 case CudaArch::GFX1102:
3526 case CudaArch::GFX1103:
3527 case CudaArch::GFX1150:
3528 case CudaArch::GFX1151:
3529 case CudaArch::GFX1200:
3530 case CudaArch::GFX1201:
3531 case CudaArch::Generic:
3532 case CudaArch::UNUSED:
3533 case CudaArch::UNKNOWN:
3534 break;
3535 case CudaArch::LAST:
3536 llvm_unreachable("Unexpected Cuda arch.");
3537 }
3538 }
3539 }
3540 CGOpenMPRuntime::processRequiresDirective(D);
3541}
3542
3543llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) {
3544 CGBuilderTy &Bld = CGF.Builder;
3545 llvm::Module *M = &CGF.CGM.getModule();
3546 const char *LocSize = "__kmpc_get_hardware_num_threads_in_block";
3547 llvm::Function *F = M->getFunction(Name: LocSize);
3548 if (!F) {
3549 F = llvm::Function::Create(
3550 Ty: llvm::FunctionType::get(Result: CGF.Int32Ty, Params: std::nullopt, isVarArg: false),
3551 Linkage: llvm::GlobalVariable::ExternalLinkage, N: LocSize, M: &CGF.CGM.getModule());
3552 }
3553 return Bld.CreateCall(Callee: F, Args: std::nullopt, Name: "nvptx_num_threads");
3554}
3555
3556llvm::Value *CGOpenMPRuntimeGPU::getGPUThreadID(CodeGenFunction &CGF) {
3557 ArrayRef<llvm::Value *> Args{};
3558 return CGF.EmitRuntimeCall(
3559 callee: OMPBuilder.getOrCreateRuntimeFunction(
3560 M&: CGM.getModule(), FnID: OMPRTL___kmpc_get_hardware_thread_id_in_block),
3561 args: Args);
3562}
3563
3564llvm::Value *CGOpenMPRuntimeGPU::getGPUWarpSize(CodeGenFunction &CGF) {
3565 ArrayRef<llvm::Value *> Args{};
3566 return CGF.EmitRuntimeCall(callee: OMPBuilder.getOrCreateRuntimeFunction(
3567 M&: CGM.getModule(), FnID: OMPRTL___kmpc_get_warp_size),
3568 args: Args);
3569}
3570
