clang 20.0.0git
CGOpenMPRuntimeGPU.cpp
Go to the documentation of this file.
1//===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This provides a generalized class for OpenMP runtime code generation
10// specialized by GPU targets NVPTX and AMDGCN.
11//
12//===----------------------------------------------------------------------===//
13
14#include "CGOpenMPRuntimeGPU.h"
15#include "CodeGenFunction.h"
16#include "clang/AST/Attr.h"
21#include "clang/Basic/Cuda.h"
22#include "llvm/ADT/SmallPtrSet.h"
23#include "llvm/Frontend/OpenMP/OMPGridValues.h"
24
25using namespace clang;
26using namespace CodeGen;
27using namespace llvm::omp;
28
29namespace {
30/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
31class NVPTXActionTy final : public PrePostActionTy {
32 llvm::FunctionCallee EnterCallee = nullptr;
34 llvm::FunctionCallee ExitCallee = nullptr;
36 bool Conditional = false;
37 llvm::BasicBlock *ContBlock = nullptr;
38
39public:
40 NVPTXActionTy(llvm::FunctionCallee EnterCallee,
42 llvm::FunctionCallee ExitCallee,
43 ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
44 : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
45 ExitArgs(ExitArgs), Conditional(Conditional) {}
46 void Enter(CodeGenFunction &CGF) override {
47 llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
48 if (Conditional) {
49 llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
50 auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
51 ContBlock = CGF.createBasicBlock("omp_if.end");
52 // Generate the branch (If-stmt)
53 CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
54 CGF.EmitBlock(ThenBlock);
55 }
56 }
57 void Done(CodeGenFunction &CGF) {
58 // Emit the rest of blocks/branches
59 CGF.EmitBranch(ContBlock);
60 CGF.EmitBlock(ContBlock, true);
61 }
62 void Exit(CodeGenFunction &CGF) override {
63 CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
64 }
65};
66
67/// A class to track the execution mode when codegening directives within
68/// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
69/// to the target region and used by containing directives such as 'parallel'
70/// to emit optimized code.
71class ExecutionRuntimeModesRAII {
72private:
76
77public:
78 ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode,
80 : ExecMode(ExecMode) {
81 SavedExecMode = ExecMode;
82 ExecMode = EntryMode;
83 }
84 ~ExecutionRuntimeModesRAII() { ExecMode = SavedExecMode; }
85};
86
87static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
88 RefExpr = RefExpr->IgnoreParens();
89 if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
90 const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
91 while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
92 Base = TempASE->getBase()->IgnoreParenImpCasts();
93 RefExpr = Base;
94 } else if (auto *OASE = dyn_cast<ArraySectionExpr>(RefExpr)) {
95 const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
96 while (const auto *TempOASE = dyn_cast<ArraySectionExpr>(Base))
97 Base = TempOASE->getBase()->IgnoreParenImpCasts();
98 while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
99 Base = TempASE->getBase()->IgnoreParenImpCasts();
100 RefExpr = Base;
101 }
102 RefExpr = RefExpr->IgnoreParenImpCasts();
103 if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
104 return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
105 const auto *ME = cast<MemberExpr>(RefExpr);
106 return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
107}
108
109static RecordDecl *buildRecordForGlobalizedVars(
111 ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
112 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
113 &MappedDeclsFields,
114 int BufSize) {
115 using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
116 if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
117 return nullptr;
118 SmallVector<VarsDataTy, 4> GlobalizedVars;
119 for (const ValueDecl *D : EscapedDecls)
120 GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
121 for (const ValueDecl *D : EscapedDeclsForTeams)
122 GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
123
124 // Build struct _globalized_locals_ty {
125 // /* globalized vars */[WarSize] align (decl_align)
126 // /* globalized vars */ for EscapedDeclsForTeams
127 // };
128 RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
129 GlobalizedRD->startDefinition();
131 EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
132 for (const auto &Pair : GlobalizedVars) {
133 const ValueDecl *VD = Pair.second;
134 QualType Type = VD->getType();
136 Type = C.getPointerType(Type.getNonReferenceType());
137 else
138 Type = Type.getNonReferenceType();
141 if (SingleEscaped.count(VD)) {
143 C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
144 C.getTrivialTypeSourceInfo(Type, SourceLocation()),
145 /*BW=*/nullptr, /*Mutable=*/false,
146 /*InitStyle=*/ICIS_NoInit);
147 Field->setAccess(AS_public);
148 if (VD->hasAttrs()) {
149 for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
150 E(VD->getAttrs().end());
151 I != E; ++I)
152 Field->addAttr(*I);
153 }
154 } else {
155 if (BufSize > 1) {
156 llvm::APInt ArraySize(32, BufSize);
157 Type = C.getConstantArrayType(Type, ArraySize, nullptr,
158 ArraySizeModifier::Normal, 0);
159 }
161 C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
162 C.getTrivialTypeSourceInfo(Type, SourceLocation()),
163 /*BW=*/nullptr, /*Mutable=*/false,
164 /*InitStyle=*/ICIS_NoInit);
165 Field->setAccess(AS_public);
166 llvm::APInt Align(32, Pair.first.getQuantity());
167 Field->addAttr(AlignedAttr::CreateImplicit(
168 C, /*IsAlignmentExpr=*/true,
170 C.getIntTypeForBitwidth(32, /*Signed=*/0),
172 {}, AlignedAttr::GNU_aligned));
173 }
174 GlobalizedRD->addDecl(Field);
175 MappedDeclsFields.try_emplace(VD, Field);
176 }
177 GlobalizedRD->completeDefinition();
178 return GlobalizedRD;
179}
180
181/// Get the list of variables that can escape their declaration context.
182class CheckVarsEscapingDeclContext final
183 : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
184 CodeGenFunction &CGF;
185 llvm::SetVector<const ValueDecl *> EscapedDecls;
186 llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
187 llvm::SetVector<const ValueDecl *> DelayedVariableLengthDecls;
188 llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
189 RecordDecl *GlobalizedRD = nullptr;
190 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
191 bool AllEscaped = false;
192 bool IsForCombinedParallelRegion = false;
193
194 void markAsEscaped(const ValueDecl *VD) {
195 // Do not globalize declare target variables.
196 if (!isa<VarDecl>(VD) ||
197 OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
198 return;
199 VD = cast<ValueDecl>(VD->getCanonicalDecl());
200 // Use user-specified allocation.
201 if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
202 return;
203 // Variables captured by value must be globalized.
204 bool IsCaptured = false;
205 if (auto *CSI = CGF.CapturedStmtInfo) {
206 if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
207 // Check if need to capture the variable that was already captured by
208 // value in the outer region.
209 IsCaptured = true;
210 if (!IsForCombinedParallelRegion) {
211 if (!FD->hasAttrs())
212 return;
213 const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
214 if (!Attr)
215 return;
216 if (((Attr->getCaptureKind() != OMPC_map) &&
217 !isOpenMPPrivate(Attr->getCaptureKind())) ||
218 ((Attr->getCaptureKind() == OMPC_map) &&
219 !FD->getType()->isAnyPointerType()))
220 return;
221 }
222 if (!FD->getType()->isReferenceType()) {
223 assert(!VD->getType()->isVariablyModifiedType() &&
224 "Parameter captured by value with variably modified type");
225 EscapedParameters.insert(VD);
226 } else if (!IsForCombinedParallelRegion) {
227 return;
228 }
229 }
230 }
231 if ((!CGF.CapturedStmtInfo ||
232 (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
233 VD->getType()->isReferenceType())
234 // Do not globalize variables with reference type.
235 return;
236 if (VD->getType()->isVariablyModifiedType()) {
237 // If not captured at the target region level then mark the escaped
238 // variable as delayed.
239 if (IsCaptured)
240 EscapedVariableLengthDecls.insert(VD);
241 else
242 DelayedVariableLengthDecls.insert(VD);
243 } else
244 EscapedDecls.insert(VD);
245 }
246
247 void VisitValueDecl(const ValueDecl *VD) {
248 if (VD->getType()->isLValueReferenceType())
249 markAsEscaped(VD);
250 if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
251 if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
252 const bool SavedAllEscaped = AllEscaped;
253 AllEscaped = VD->getType()->isLValueReferenceType();
254 Visit(VarD->getInit());
255 AllEscaped = SavedAllEscaped;
256 }
257 }
258 }
259 void VisitOpenMPCapturedStmt(const CapturedStmt *S,
260 ArrayRef<OMPClause *> Clauses,
261 bool IsCombinedParallelRegion) {
262 if (!S)
263 return;
264 for (const CapturedStmt::Capture &C : S->captures()) {
265 if (C.capturesVariable() && !C.capturesVariableByCopy()) {
266 const ValueDecl *VD = C.getCapturedVar();
267 bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
268 if (IsCombinedParallelRegion) {
269 // Check if the variable is privatized in the combined construct and
270 // those private copies must be shared in the inner parallel
271 // directive.
272 IsForCombinedParallelRegion = false;
273 for (const OMPClause *C : Clauses) {
274 if (!isOpenMPPrivate(C->getClauseKind()) ||
275 C->getClauseKind() == OMPC_reduction ||
276 C->getClauseKind() == OMPC_linear ||
277 C->getClauseKind() == OMPC_private)
278 continue;
280 if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
281 Vars = PC->getVarRefs();
282 else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
283 Vars = PC->getVarRefs();
284 else
285 llvm_unreachable("Unexpected clause.");
286 for (const auto *E : Vars) {
287 const Decl *D =
288 cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
289 if (D == VD->getCanonicalDecl()) {
290 IsForCombinedParallelRegion = true;
291 break;
292 }
293 }
294 if (IsForCombinedParallelRegion)
295 break;
296 }
297 }
298 markAsEscaped(VD);
299 if (isa<OMPCapturedExprDecl>(VD))
300 VisitValueDecl(VD);
301 IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
302 }
303 }
304 }
305
306 void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
307 assert(!GlobalizedRD &&
308 "Record for globalized variables is built already.");
309 ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
310 unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
311 if (IsInTTDRegion)
312 EscapedDeclsForTeams = EscapedDecls.getArrayRef();
313 else
314 EscapedDeclsForParallel = EscapedDecls.getArrayRef();
315 GlobalizedRD = ::buildRecordForGlobalizedVars(
316 CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
317 MappedDeclsFields, WarpSize);
318 }
319
320public:
321 CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
322 ArrayRef<const ValueDecl *> TeamsReductions)
323 : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
324 }
325 virtual ~CheckVarsEscapingDeclContext() = default;
326 void VisitDeclStmt(const DeclStmt *S) {
327 if (!S)
328 return;
329 for (const Decl *D : S->decls())
330 if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
331 VisitValueDecl(VD);
332 }
333 void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
334 if (!D)
335 return;
336 if (!D->hasAssociatedStmt())
337 return;
338 if (const auto *S =
339 dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
340 // Do not analyze directives that do not actually require capturing,
341 // like `omp for` or `omp simd` directives.
343 getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
344 if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
345 VisitStmt(S->getCapturedStmt());
346 return;
347 }
348 VisitOpenMPCapturedStmt(
349 S, D->clauses(),
350 CaptureRegions.back() == OMPD_parallel &&
351 isOpenMPDistributeDirective(D->getDirectiveKind()));
352 }
353 }
354 void VisitCapturedStmt(const CapturedStmt *S) {
355 if (!S)
356 return;
357 for (const CapturedStmt::Capture &C : S->captures()) {
358 if (C.capturesVariable() && !C.capturesVariableByCopy()) {
359 const ValueDecl *VD = C.getCapturedVar();
360 markAsEscaped(VD);
361 if (isa<OMPCapturedExprDecl>(VD))
362 VisitValueDecl(VD);
363 }
364 }
365 }
366 void VisitLambdaExpr(const LambdaExpr *E) {
367 if (!E)
368 return;
369 for (const LambdaCapture &C : E->captures()) {
370 if (C.capturesVariable()) {
371 if (C.getCaptureKind() == LCK_ByRef) {
372 const ValueDecl *VD = C.getCapturedVar();
373 markAsEscaped(VD);
374 if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
375 VisitValueDecl(VD);
376 }
377 }
378 }
379 }
380 void VisitBlockExpr(const BlockExpr *E) {
381 if (!E)
382 return;
383 for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
384 if (C.isByRef()) {
385 const VarDecl *VD = C.getVariable();
386 markAsEscaped(VD);
387 if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
388 VisitValueDecl(VD);
389 }
390 }
391 }
392 void VisitCallExpr(const CallExpr *E) {
393 if (!E)
394 return;
395 for (const Expr *Arg : E->arguments()) {
396 if (!Arg)
397 continue;
398 if (Arg->isLValue()) {
399 const bool SavedAllEscaped = AllEscaped;
400 AllEscaped = true;
401 Visit(Arg);
402 AllEscaped = SavedAllEscaped;
403 } else {
404 Visit(Arg);
405 }
406 }
407 Visit(E->getCallee());
408 }
409 void VisitDeclRefExpr(const DeclRefExpr *E) {
410 if (!E)
411 return;
412 const ValueDecl *VD = E->getDecl();
413 if (AllEscaped)
414 markAsEscaped(VD);
415 if (isa<OMPCapturedExprDecl>(VD))
416 VisitValueDecl(VD);
417 else if (VD->isInitCapture())
418 VisitValueDecl(VD);
419 }
420 void VisitUnaryOperator(const UnaryOperator *E) {
421 if (!E)
422 return;
423 if (E->getOpcode() == UO_AddrOf) {
424 const bool SavedAllEscaped = AllEscaped;
425 AllEscaped = true;
426 Visit(E->getSubExpr());
427 AllEscaped = SavedAllEscaped;
428 } else {
429 Visit(E->getSubExpr());
430 }
431 }
432 void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
433 if (!E)
434 return;
435 if (E->getCastKind() == CK_ArrayToPointerDecay) {
436 const bool SavedAllEscaped = AllEscaped;
437 AllEscaped = true;
438 Visit(E->getSubExpr());
439 AllEscaped = SavedAllEscaped;
440 } else {
441 Visit(E->getSubExpr());
442 }
443 }
444 void VisitExpr(const Expr *E) {
445 if (!E)
446 return;
447 bool SavedAllEscaped = AllEscaped;
448 if (!E->isLValue())
449 AllEscaped = false;
450 for (const Stmt *Child : E->children())
451 if (Child)
452 Visit(Child);
453 AllEscaped = SavedAllEscaped;
454 }
455 void VisitStmt(const Stmt *S) {
456 if (!S)
457 return;
458 for (const Stmt *Child : S->children())
459 if (Child)
460 Visit(Child);
461 }
462
463 /// Returns the record that handles all the escaped local variables and used
464 /// instead of their original storage.
465 const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
466 if (!GlobalizedRD)
467 buildRecordForGlobalizedVars(IsInTTDRegion);
468 return GlobalizedRD;
469 }
470
471 /// Returns the field in the globalized record for the escaped variable.
472 const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
473 assert(GlobalizedRD &&
474 "Record for globalized variables must be generated already.");
475 return MappedDeclsFields.lookup(VD);
476 }
477
478 /// Returns the list of the escaped local variables/parameters.
479 ArrayRef<const ValueDecl *> getEscapedDecls() const {
480 return EscapedDecls.getArrayRef();
481 }
482
483 /// Checks if the escaped local variable is actually a parameter passed by
484 /// value.
485 const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
486 return EscapedParameters;
487 }
488
489 /// Returns the list of the escaped variables with the variably modified
490 /// types.
491 ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
492 return EscapedVariableLengthDecls.getArrayRef();
493 }
494
495 /// Returns the list of the delayed variables with the variably modified
496 /// types.
497 ArrayRef<const ValueDecl *> getDelayedVariableLengthDecls() const {
498 return DelayedVariableLengthDecls.getArrayRef();
499 }
500};
501} // anonymous namespace
502
504CGOpenMPRuntimeGPU::getExecutionMode() const {
505 return CurrentExecutionMode;
506}
507
509CGOpenMPRuntimeGPU::getDataSharingMode() const {
510 return CurrentDataSharingMode;
511}
512
513/// Check for inner (nested) SPMD construct, if any
515 const OMPExecutableDirective &D) {
516 const auto *CS = D.getInnermostCapturedStmt();
517 const auto *Body =
518 CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
519 const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
520
521 if (const auto *NestedDir =
522 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
523 OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
524 switch (D.getDirectiveKind()) {
525 case OMPD_target:
526 if (isOpenMPParallelDirective(DKind))
527 return true;
528 if (DKind == OMPD_teams) {
529 Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
530 /*IgnoreCaptured=*/true);
531 if (!Body)
532 return false;
533 ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
534 if (const auto *NND =
535 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
536 DKind = NND->getDirectiveKind();
537 if (isOpenMPParallelDirective(DKind))
538 return true;
539 }
540 }
541 return false;
542 case OMPD_target_teams:
543 return isOpenMPParallelDirective(DKind);
544 case OMPD_target_simd:
545 case OMPD_target_parallel:
546 case OMPD_target_parallel_for:
547 case OMPD_target_parallel_for_simd:
548 case OMPD_target_teams_distribute:
549 case OMPD_target_teams_distribute_simd:
550 case OMPD_target_teams_distribute_parallel_for:
551 case OMPD_target_teams_distribute_parallel_for_simd:
552 case OMPD_parallel:
553 case OMPD_for:
554 case OMPD_parallel_for:
555 case OMPD_parallel_master:
556 case OMPD_parallel_sections:
557 case OMPD_for_simd:
558 case OMPD_parallel_for_simd:
559 case OMPD_cancel:
560 case OMPD_cancellation_point:
561 case OMPD_ordered:
562 case OMPD_threadprivate:
563 case OMPD_allocate:
564 case OMPD_task:
565 case OMPD_simd:
566 case OMPD_sections:
567 case OMPD_section:
568 case OMPD_single:
569 case OMPD_master:
570 case OMPD_critical:
571 case OMPD_taskyield:
572 case OMPD_barrier:
573 case OMPD_taskwait:
574 case OMPD_taskgroup:
575 case OMPD_atomic:
576 case OMPD_flush:
577 case OMPD_depobj:
578 case OMPD_scan:
579 case OMPD_teams:
580 case OMPD_target_data:
581 case OMPD_target_exit_data:
582 case OMPD_target_enter_data:
583 case OMPD_distribute:
584 case OMPD_distribute_simd:
585 case OMPD_distribute_parallel_for:
586 case OMPD_distribute_parallel_for_simd:
587 case OMPD_teams_distribute:
588 case OMPD_teams_distribute_simd:
589 case OMPD_teams_distribute_parallel_for:
590 case OMPD_teams_distribute_parallel_for_simd:
591 case OMPD_target_update:
592 case OMPD_declare_simd:
593 case OMPD_declare_variant:
594 case OMPD_begin_declare_variant:
595 case OMPD_end_declare_variant:
596 case OMPD_declare_target:
597 case OMPD_end_declare_target:
598 case OMPD_declare_reduction:
599 case OMPD_declare_mapper:
600 case OMPD_taskloop:
601 case OMPD_taskloop_simd:
602 case OMPD_master_taskloop:
603 case OMPD_master_taskloop_simd:
604 case OMPD_parallel_master_taskloop:
605 case OMPD_parallel_master_taskloop_simd:
606 case OMPD_requires:
607 case OMPD_unknown:
608 default:
609 llvm_unreachable("Unexpected directive.");
610 }
611 }
612
613 return false;
614}
615
617 const OMPExecutableDirective &D) {
618 OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
619 switch (DirectiveKind) {
620 case OMPD_target:
621 case OMPD_target_teams:
622 return hasNestedSPMDDirective(Ctx, D);
623 case OMPD_target_parallel_loop:
624 case OMPD_target_parallel:
625 case OMPD_target_parallel_for:
626 case OMPD_target_parallel_for_simd:
627 case OMPD_target_teams_distribute_parallel_for:
628 case OMPD_target_teams_distribute_parallel_for_simd:
629 case OMPD_target_simd:
630 case OMPD_target_teams_distribute_simd:
631 return true;
632 case OMPD_target_teams_distribute:
633 return false;
634 case OMPD_target_teams_loop:
635 // Whether this is true or not depends on how the directive will
636 // eventually be emitted.
637 if (auto *TTLD = dyn_cast<OMPTargetTeamsGenericLoopDirective>(&D))
638 return TTLD->canBeParallelFor();
639 return false;
640 case OMPD_parallel:
641 case OMPD_for:
642 case OMPD_parallel_for:
643 case OMPD_parallel_master:
644 case OMPD_parallel_sections:
645 case OMPD_for_simd:
646 case OMPD_parallel_for_simd:
647 case OMPD_cancel:
648 case OMPD_cancellation_point:
649 case OMPD_ordered:
650 case OMPD_threadprivate:
651 case OMPD_allocate:
652 case OMPD_task:
653 case OMPD_simd:
654 case OMPD_sections:
655 case OMPD_section:
656 case OMPD_single:
657 case OMPD_master:
658 case OMPD_critical:
659 case OMPD_taskyield:
660 case OMPD_barrier:
661 case OMPD_taskwait:
662 case OMPD_taskgroup:
663 case OMPD_atomic:
664 case OMPD_flush:
665 case OMPD_depobj:
666 case OMPD_scan:
667 case OMPD_teams:
668 case OMPD_target_data:
669 case OMPD_target_exit_data:
670 case OMPD_target_enter_data:
671 case OMPD_distribute:
672 case OMPD_distribute_simd:
673 case OMPD_distribute_parallel_for:
674 case OMPD_distribute_parallel_for_simd:
675 case OMPD_teams_distribute:
676 case OMPD_teams_distribute_simd:
677 case OMPD_teams_distribute_parallel_for:
678 case OMPD_teams_distribute_parallel_for_simd:
679 case OMPD_target_update:
680 case OMPD_declare_simd:
681 case OMPD_declare_variant:
682 case OMPD_begin_declare_variant:
683 case OMPD_end_declare_variant:
684 case OMPD_declare_target:
685 case OMPD_end_declare_target:
686 case OMPD_declare_reduction:
687 case OMPD_declare_mapper:
688 case OMPD_taskloop:
689 case OMPD_taskloop_simd:
690 case OMPD_master_taskloop:
691 case OMPD_master_taskloop_simd:
692 case OMPD_parallel_master_taskloop:
693 case OMPD_parallel_master_taskloop_simd:
694 case OMPD_requires:
695 case OMPD_unknown:
696 default:
697 break;
698 }
699 llvm_unreachable(
700 "Unknown programming model for OpenMP directive on NVPTX target.");
701}
702
703void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
704 StringRef ParentName,
705 llvm::Function *&OutlinedFn,
706 llvm::Constant *&OutlinedFnID,
707 bool IsOffloadEntry,
708 const RegionCodeGenTy &CodeGen) {
709 ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_NonSPMD);
710 EntryFunctionState EST;
711 WrapperFunctionsMap.clear();
712
713 [[maybe_unused]] bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
714 assert(!IsBareKernel && "bare kernel should not be at generic mode");
715
716 // Emit target region as a standalone region.
717 class NVPTXPrePostActionTy : public PrePostActionTy {
718 CGOpenMPRuntimeGPU::EntryFunctionState &EST;
720
721 public:
722 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST,
724 : EST(EST), D(D) {}
725 void Enter(CodeGenFunction &CGF) override {
726 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
727 RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ false);
728 // Skip target region initialization.
729 RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
730 }
731 void Exit(CodeGenFunction &CGF) override {
732 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
734 RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false);
735 }
736 } Action(EST, D);
737 CodeGen.setAction(Action);
738 IsInTTDRegion = true;
739 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
740 IsOffloadEntry, CodeGen);
741 IsInTTDRegion = false;
742}
743
744void CGOpenMPRuntimeGPU::emitKernelInit(const OMPExecutableDirective &D,
745 CodeGenFunction &CGF,
746 EntryFunctionState &EST, bool IsSPMD) {
747 int32_t MinThreadsVal = 1, MaxThreadsVal = -1, MinTeamsVal = 1,
748 MaxTeamsVal = -1;
749 computeMinAndMaxThreadsAndTeams(D, CGF, MinThreadsVal, MaxThreadsVal,
750 MinTeamsVal, MaxTeamsVal);
751
752 CGBuilderTy &Bld = CGF.Builder;
753 Bld.restoreIP(OMPBuilder.createTargetInit(
754 Bld, IsSPMD, MinThreadsVal, MaxThreadsVal, MinTeamsVal, MaxTeamsVal));
755 if (!IsSPMD)
756 emitGenericVarsProlog(CGF, EST.Loc);
757}
758
759void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
760 EntryFunctionState &EST,
761 bool IsSPMD) {
762 if (!IsSPMD)
763 emitGenericVarsEpilog(CGF);
764
765 // This is temporary until we remove the fixed sized buffer.
767 RecordDecl *StaticRD = C.buildImplicitRecord(
768 "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::Union);
769 StaticRD->startDefinition();
770 for (const RecordDecl *TeamReductionRec : TeamsReductions) {
771 QualType RecTy = C.getRecordType(TeamReductionRec);
772 auto *Field = FieldDecl::Create(
773 C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
774 C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
775 /*BW=*/nullptr, /*Mutable=*/false,
776 /*InitStyle=*/ICIS_NoInit);
777 Field->setAccess(AS_public);
778 StaticRD->addDecl(Field);
779 }
780 StaticRD->completeDefinition();
781 QualType StaticTy = C.getRecordType(StaticRD);
782 llvm::Type *LLVMReductionsBufferTy =
783 CGM.getTypes().ConvertTypeForMem(StaticTy);
784 const auto &DL = CGM.getModule().getDataLayout();
785 uint64_t ReductionDataSize =
786 TeamsReductions.empty()
787 ? 0
788 : DL.getTypeAllocSize(LLVMReductionsBufferTy).getFixedValue();
789 CGBuilderTy &Bld = CGF.Builder;
790 OMPBuilder.createTargetDeinit(Bld, ReductionDataSize,
791 C.getLangOpts().OpenMPCUDAReductionBufNum);
792 TeamsReductions.clear();
793}
794
795void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
796 StringRef ParentName,
797 llvm::Function *&OutlinedFn,
798 llvm::Constant *&OutlinedFnID,
799 bool IsOffloadEntry,
800 const RegionCodeGenTy &CodeGen) {
801 ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_SPMD);
802 EntryFunctionState EST;
803
804 bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
805
806 // Emit target region as a standalone region.
807 class NVPTXPrePostActionTy : public PrePostActionTy {
809 CGOpenMPRuntimeGPU::EntryFunctionState &EST;
810 bool IsBareKernel;
811 DataSharingMode Mode;
813
814 public:
815 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
816 CGOpenMPRuntimeGPU::EntryFunctionState &EST,
817 bool IsBareKernel, const OMPExecutableDirective &D)
818 : RT(RT), EST(EST), IsBareKernel(IsBareKernel),
819 Mode(RT.CurrentDataSharingMode), D(D) {}
820 void Enter(CodeGenFunction &CGF) override {
821 if (IsBareKernel) {
822 RT.CurrentDataSharingMode = DataSharingMode::DS_CUDA;
823 return;
824 }
825 RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ true);
826 // Skip target region initialization.
827 RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
828 }
829 void Exit(CodeGenFunction &CGF) override {
830 if (IsBareKernel) {
831 RT.CurrentDataSharingMode = Mode;
832 return;
833 }
834 RT.clearLocThreadIdInsertPt(CGF);
835 RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true);
836 }
837 } Action(*this, EST, IsBareKernel, D);
838 CodeGen.setAction(Action);
839 IsInTTDRegion = true;
840 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
841 IsOffloadEntry, CodeGen);
842 IsInTTDRegion = false;
843}
844
845void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
846 const OMPExecutableDirective &D, StringRef ParentName,
847 llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
848 bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
849 if (!IsOffloadEntry) // Nothing to do.
850 return;
851
852 assert(!ParentName.empty() && "Invalid target region parent name!");
853
855 bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
856 if (Mode || IsBareKernel)
857 emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
858 CodeGen);
859 else
860 emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
861 CodeGen);
862}
863
865 : CGOpenMPRuntime(CGM) {
866 llvm::OpenMPIRBuilderConfig Config(
867 CGM.getLangOpts().OpenMPIsTargetDevice, isGPU(),
868 CGM.getLangOpts().OpenMPOffloadMandatory,
869 /*HasRequiresReverseOffload*/ false, /*HasRequiresUnifiedAddress*/ false,
870 hasRequiresUnifiedSharedMemory(), /*HasRequiresDynamicAllocators*/ false);
871 OMPBuilder.setConfig(Config);
872
873 if (!CGM.getLangOpts().OpenMPIsTargetDevice)
874 llvm_unreachable("OpenMP can only handle device code.");
875
876 if (CGM.getLangOpts().OpenMPCUDAMode)
877 CurrentDataSharingMode = CGOpenMPRuntimeGPU::DS_CUDA;
878
879 llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder();
880 if (CGM.getLangOpts().NoGPULib || CGM.getLangOpts().OMPHostIRFile.empty())
881 return;
882
883 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTargetDebug,
884 "__omp_rtl_debug_kind");
885 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTeamSubscription,
886 "__omp_rtl_assume_teams_oversubscription");
887 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPThreadSubscription,
888 "__omp_rtl_assume_threads_oversubscription");
889 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoThreadState,
890 "__omp_rtl_assume_no_thread_state");
891 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoNestedParallelism,
892 "__omp_rtl_assume_no_nested_parallelism");
893}
894
896 ProcBindKind ProcBind,
898 // Nothing to do.
899}
900
902 llvm::Value *NumThreads,
904 // Nothing to do.
905}
906
908 const Expr *NumTeams,
909 const Expr *ThreadLimit,
911
914 const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
915 const RegionCodeGenTy &CodeGen) {
916 // Emit target region as a standalone region.
917 bool PrevIsInTTDRegion = IsInTTDRegion;
918 IsInTTDRegion = false;
919 auto *OutlinedFun =
921 CGF, D, ThreadIDVar, InnermostKind, CodeGen));
922 IsInTTDRegion = PrevIsInTTDRegion;
923 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) {
924 llvm::Function *WrapperFun =
925 createParallelDataSharingWrapper(OutlinedFun, D);
926 WrapperFunctionsMap[OutlinedFun] = WrapperFun;
927 }
928
929 return OutlinedFun;
930}
931
932/// Get list of lastprivate variables from the teams distribute ... or
933/// teams {distribute ...} directives.
934static void
937 assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
938 "expected teams directive.");
939 const OMPExecutableDirective *Dir = &D;
940 if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
942 Ctx,
943 D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
944 /*IgnoreCaptured=*/true))) {
945 Dir = dyn_cast_or_null<OMPExecutableDirective>(S);
947 Dir = nullptr;
948 }
949 }
950 if (!Dir)
951 return;
952 for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
953 for (const Expr *E : C->getVarRefs())
954 Vars.push_back(getPrivateItem(E));
955 }
956}
957
958/// Get list of reduction variables from the teams ... directives.
959static void
962 assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
963 "expected teams directive.");
964 for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
965 for (const Expr *E : C->privates())
966 Vars.push_back(getPrivateItem(E));
967 }
968}
969
972 const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
973 const RegionCodeGenTy &CodeGen) {
975
976 const RecordDecl *GlobalizedRD = nullptr;
977 llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
978 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
979 unsigned WarpSize = CGM.getTarget().getGridValue().GV_Warp_Size;
980 // Globalize team reductions variable unconditionally in all modes.
981 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
982 getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
983 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
984 getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
985 if (!LastPrivatesReductions.empty()) {
986 GlobalizedRD = ::buildRecordForGlobalizedVars(
987 CGM.getContext(), {}, LastPrivatesReductions, MappedDeclsFields,
988 WarpSize);
989 }
990 } else if (!LastPrivatesReductions.empty()) {
991 assert(!TeamAndReductions.first &&
992 "Previous team declaration is not expected.");
993 TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
994 std::swap(TeamAndReductions.second, LastPrivatesReductions);
995 }
996
997 // Emit target region as a standalone region.
998 class NVPTXPrePostActionTy : public PrePostActionTy {
1000 const RecordDecl *GlobalizedRD;
1001 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1002 &MappedDeclsFields;
1003
1004 public:
1005 NVPTXPrePostActionTy(
1006 SourceLocation &Loc, const RecordDecl *GlobalizedRD,
1007 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1008 &MappedDeclsFields)
1009 : Loc(Loc), GlobalizedRD(GlobalizedRD),
1010 MappedDeclsFields(MappedDeclsFields) {}
1011 void Enter(CodeGenFunction &CGF) override {
1012 auto &Rt =
1013 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1014 if (GlobalizedRD) {
1015 auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
1016 I->getSecond().MappedParams =
1017 std::make_unique<CodeGenFunction::OMPMapVars>();
1018 DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
1019 for (const auto &Pair : MappedDeclsFields) {
1020 assert(Pair.getFirst()->isCanonicalDecl() &&
1021 "Expected canonical declaration");
1022 Data.insert(std::make_pair(Pair.getFirst(), MappedVarData()));
1023 }
1024 }
1025 Rt.emitGenericVarsProlog(CGF, Loc);
1026 }
1027 void Exit(CodeGenFunction &CGF) override {
1028 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
1029 .emitGenericVarsEpilog(CGF);
1030 }
1031 } Action(Loc, GlobalizedRD, MappedDeclsFields);
1032 CodeGen.setAction(Action);
1033 llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
1034 CGF, D, ThreadIDVar, InnermostKind, CodeGen);
1035
1036 return OutlinedFun;
1037}
1038
1039void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
1041 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
1042 return;
1043
1044 CGBuilderTy &Bld = CGF.Builder;
1045
1046 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1047 if (I == FunctionGlobalizedDecls.end())
1048 return;
1049
1050 for (auto &Rec : I->getSecond().LocalVarData) {
1051 const auto *VD = cast<VarDecl>(Rec.first);
1052 bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
1053 QualType VarTy = VD->getType();
1054
1055 // Get the local allocation of a firstprivate variable before sharing
1056 llvm::Value *ParValue;
1057 if (EscapedParam) {
1058 LValue ParLVal =
1059 CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
1060 ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
1061 }
1062
1063 // Allocate space for the variable to be globalized
1064 llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
1065 llvm::CallBase *VoidPtr =
1066 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1067 CGM.getModule(), OMPRTL___kmpc_alloc_shared),
1068 AllocArgs, VD->getName());
1069 // FIXME: We should use the variables actual alignment as an argument.
1070 VoidPtr->addRetAttr(llvm::Attribute::get(
1071 CGM.getLLVMContext(), llvm::Attribute::Alignment,
1073
1074 // Cast the void pointer and get the address of the globalized variable.
1075 llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1076 VoidPtr, Bld.getPtrTy(0), VD->getName() + "_on_stack");
1077 LValue VarAddr =
1078 CGF.MakeNaturalAlignPointeeRawAddrLValue(CastedVoidPtr, VarTy);
1079 Rec.second.PrivateAddr = VarAddr.getAddress();
1080 Rec.second.GlobalizedVal = VoidPtr;
1081
1082 // Assign the local allocation to the newly globalized location.
1083 if (EscapedParam) {
1084 CGF.EmitStoreOfScalar(ParValue, VarAddr);
1085 I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress());
1086 }
1087 if (auto *DI = CGF.getDebugInfo())
1088 VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(VD->getLocation()));
1089 }
1090
1091 for (const auto *ValueD : I->getSecond().EscapedVariableLengthDecls) {
1092 const auto *VD = cast<VarDecl>(ValueD);
1093 std::pair<llvm::Value *, llvm::Value *> AddrSizePair =
1094 getKmpcAllocShared(CGF, VD);
1095 I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(AddrSizePair);
1096 LValue Base = CGF.MakeAddrLValue(AddrSizePair.first, VD->getType(),
1099 I->getSecond().MappedParams->setVarAddr(CGF, VD, Base.getAddress());
1100 }
1101 I->getSecond().MappedParams->apply(CGF);
1102}
1103
1105 const VarDecl *VD) const {
1106 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1107 if (I == FunctionGlobalizedDecls.end())
1108 return false;
1109
1110 // Check variable declaration is delayed:
1111 return llvm::is_contained(I->getSecond().DelayedVariableLengthDecls, VD);
1112}
1113
1114std::pair<llvm::Value *, llvm::Value *>
1116 const VarDecl *VD) {
1117 CGBuilderTy &Bld = CGF.Builder;
1118
1119 // Compute size and alignment.
1120 llvm::Value *Size = CGF.getTypeSize(VD->getType());
1121 CharUnits Align = CGM.getContext().getDeclAlign(VD);
1122 Size = Bld.CreateNUWAdd(
1123 Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
1124 llvm::Value *AlignVal =
1125 llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
1126 Size = Bld.CreateUDiv(Size, AlignVal);
1127 Size = Bld.CreateNUWMul(Size, AlignVal);
1128
1129 // Allocate space for this VLA object to be globalized.
1130 llvm::Value *AllocArgs[] = {Size};
1131 llvm::CallBase *VoidPtr =
1132 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1133 CGM.getModule(), OMPRTL___kmpc_alloc_shared),
1134 AllocArgs, VD->getName());
1135 VoidPtr->addRetAttr(llvm::Attribute::get(
1136 CGM.getLLVMContext(), llvm::Attribute::Alignment, Align.getQuantity()));
1137
1138 return std::make_pair(VoidPtr, Size);
1139}
1140
1142 CodeGenFunction &CGF,
1143 const std::pair<llvm::Value *, llvm::Value *> &AddrSizePair) {
1144 // Deallocate the memory for each globalized VLA object
1145 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1146 CGM.getModule(), OMPRTL___kmpc_free_shared),
1147 {AddrSizePair.first, AddrSizePair.second});
1148}
1149
1150void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF) {
1151 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
1152 return;
1153
1154 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1155 if (I != FunctionGlobalizedDecls.end()) {
1156 // Deallocate the memory for each globalized VLA object that was
1157 // globalized in the prolog (i.e. emitGenericVarsProlog).
1158 for (const auto &AddrSizePair :
1159 llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
1160 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1161 CGM.getModule(), OMPRTL___kmpc_free_shared),
1162 {AddrSizePair.first, AddrSizePair.second});
1163 }
1164 // Deallocate the memory for each globalized value
1165 for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) {
1166 const auto *VD = cast<VarDecl>(Rec.first);
1167 I->getSecond().MappedParams->restore(CGF);
1168
1169 llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
1170 CGF.getTypeSize(VD->getType())};
1171 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1172 CGM.getModule(), OMPRTL___kmpc_free_shared),
1173 FreeArgs);
1174 }
1175 }
1176}
1177
1181 llvm::Function *OutlinedFn,
1182 ArrayRef<llvm::Value *> CapturedVars) {
1183 if (!CGF.HaveInsertPoint())
1184 return;
1185
1186 bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
1187
1189 /*Name=*/".zero.addr");
1190 CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
1192 // We don't emit any thread id function call in bare kernel, but because the
1193 // outlined function has a pointer argument, we emit a nullptr here.
1194 if (IsBareKernel)
1195 OutlinedFnArgs.push_back(llvm::ConstantPointerNull::get(CGM.VoidPtrTy));
1196 else
1197 OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).emitRawPointer(CGF));
1198 OutlinedFnArgs.push_back(ZeroAddr.getPointer());
1199 OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
1200 emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
1201}
1202
1205 llvm::Function *OutlinedFn,
1206 ArrayRef<llvm::Value *> CapturedVars,
1207 const Expr *IfCond,
1208 llvm::Value *NumThreads) {
1209 if (!CGF.HaveInsertPoint())
1210 return;
1211
1212 auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, IfCond,
1213 NumThreads](CodeGenFunction &CGF,
1214 PrePostActionTy &Action) {
1215 CGBuilderTy &Bld = CGF.Builder;
1216 llvm::Value *NumThreadsVal = NumThreads;
1217 llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
1218 llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy);
1219 if (WFn)
1220 ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
1221 llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy);
1222
1223 // Create a private scope that will globalize the arguments
1224 // passed from the outside of the target region.
1225 // TODO: Is that needed?
1226 CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
1227
1228 Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
1229 llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()),
1230 "captured_vars_addrs");
1231 // There's something to share.
1232 if (!CapturedVars.empty()) {
1233 // Prepare for parallel region. Indicate the outlined function.
1234 ASTContext &Ctx = CGF.getContext();
1235 unsigned Idx = 0;
1236 for (llvm::Value *V : CapturedVars) {
1237 Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx);
1238 llvm::Value *PtrV;
1239 if (V->getType()->isIntegerTy())
1240 PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
1241 else
1243 CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
1244 Ctx.getPointerType(Ctx.VoidPtrTy));
1245 ++Idx;
1246 }
1247 }
1248
1249 llvm::Value *IfCondVal = nullptr;
1250 if (IfCond)
1251 IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty,
1252 /* isSigned */ false);
1253 else
1254 IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);
1255
1256 if (!NumThreadsVal)
1257 NumThreadsVal = llvm::ConstantInt::get(CGF.Int32Ty, -1);
1258 else
1259 NumThreadsVal = Bld.CreateZExtOrTrunc(NumThreadsVal, CGF.Int32Ty),
1260
1261 assert(IfCondVal && "Expected a value");
1262 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
1263 llvm::Value *Args[] = {
1264 RTLoc,
1265 getThreadID(CGF, Loc),
1266 IfCondVal,
1267 NumThreadsVal,
1268 llvm::ConstantInt::get(CGF.Int32Ty, -1),
1269 FnPtr,
1270 ID,
1271 Bld.CreateBitOrPointerCast(CapturedVarsAddrs.emitRawPointer(CGF),
1272 CGF.VoidPtrPtrTy),
1273 llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
1274 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1275 CGM.getModule(), OMPRTL___kmpc_parallel_51),
1276 Args);
1277 };
1278
1279 RegionCodeGenTy RCG(ParallelGen);
1280 RCG(CGF);
1281}
1282
1283void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
1284 // Always emit simple barriers!
1285 if (!CGF.HaveInsertPoint())
1286 return;
1287 // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
1288 // This function does not use parameters, so we can emit just default values.
1289 llvm::Value *Args[] = {
1290 llvm::ConstantPointerNull::get(
1291 cast<llvm::PointerType>(getIdentTyPointerTy())),
1292 llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};
1293 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1294 CGM.getModule(), OMPRTL___kmpc_barrier_simple_spmd),
1295 Args);
1296}
1297
1300 OpenMPDirectiveKind Kind, bool,
1301 bool) {
1302 // Always emit simple barriers!
1303 if (!CGF.HaveInsertPoint())
1304 return;
1305 // Build call __kmpc_cancel_barrier(loc, thread_id);
1306 unsigned Flags = getDefaultFlagsForBarriers(Kind);
1307 llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
1308 getThreadID(CGF, Loc)};
1309
1310 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1311 CGM.getModule(), OMPRTL___kmpc_barrier),
1312 Args);
1313}
1314
1316 CodeGenFunction &CGF, StringRef CriticalName,
1317 const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
1318 const Expr *Hint) {
1319 llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
1320 llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
1321 llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
1322 llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
1323 llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");
1324
1325 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1326
1327 // Get the mask of active threads in the warp.
1328 llvm::Value *Mask = CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1329 CGM.getModule(), OMPRTL___kmpc_warp_active_thread_mask));
1330 // Fetch team-local id of the thread.
1331 llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
1332
1333 // Get the width of the team.
1334 llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);
1335
1336 // Initialize the counter variable for the loop.
1337 QualType Int32Ty =
1338 CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
1339 Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
1340 LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
1341 CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
1342 /*isInit=*/true);
1343
1344 // Block checks if loop counter exceeds upper bound.
1345 CGF.EmitBlock(LoopBB);
1346 llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
1347 llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
1348 CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);
1349
1350 // Block tests which single thread should execute region, and which threads
1351 // should go straight to synchronisation point.
1352 CGF.EmitBlock(TestBB);
1353 CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
1354 llvm::Value *CmpThreadToCounter =
1355 CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
1356 CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);
1357
1358 // Block emits the body of the critical region.
1359 CGF.EmitBlock(BodyBB);
1360
1361 // Output the critical statement.
1362 CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
1363 Hint);
1364
1365 // After the body surrounded by the critical region, the single executing
1366 // thread will jump to the synchronisation point.
1367 // Block waits for all threads in current team to finish then increments the
1368 // counter variable and returns to the loop.
1369 CGF.EmitBlock(SyncBB);
1370 // Reconverge active threads in the warp.
1371 (void)CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1372 CGM.getModule(), OMPRTL___kmpc_syncwarp),
1373 Mask);
1374
1375 llvm::Value *IncCounterVal =
1376 CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
1377 CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
1378 CGF.EmitBranch(LoopBB);
1379
1380 // Block that is reached when all threads in the team complete the region.
1381 CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
1382}
1383
1384/// Cast value to the specified type.
1385static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
1386 QualType ValTy, QualType CastTy,
1388 assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
1389 "Cast type must sized.");
1390 assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
1391 "Val type must sized.");
1392 llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
1393 if (ValTy == CastTy)
1394 return Val;
1395 if (CGF.getContext().getTypeSizeInChars(ValTy) ==
1396 CGF.getContext().getTypeSizeInChars(CastTy))
1397 return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
1398 if (CastTy->isIntegerType() && ValTy->isIntegerType())
1399 return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
1401 Address CastItem = CGF.CreateMemTemp(CastTy);
1402 Address ValCastItem = CastItem.withElementType(Val->getType());
1403 CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy,
1405 TBAAAccessInfo());
1406 return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc,
1408 TBAAAccessInfo());
1409}
1410
1411///
1412/// Design of OpenMP reductions on the GPU
1413///
1414/// Consider a typical OpenMP program with one or more reduction
1415/// clauses:
1416///
1417/// float foo;
1418/// double bar;
1419/// #pragma omp target teams distribute parallel for \
1420/// reduction(+:foo) reduction(*:bar)
1421/// for (int i = 0; i < N; i++) {
1422/// foo += A[i]; bar *= B[i];
1423/// }
1424///
1425/// where 'foo' and 'bar' are reduced across all OpenMP threads in
1426/// all teams. In our OpenMP implementation on the NVPTX device an
1427/// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
1428/// within a team are mapped to CUDA threads within a threadblock.
1429/// Our goal is to efficiently aggregate values across all OpenMP
1430/// threads such that:
1431///
1432/// - the compiler and runtime are logically concise, and
1433/// - the reduction is performed efficiently in a hierarchical
1434/// manner as follows: within OpenMP threads in the same warp,
1435/// across warps in a threadblock, and finally across teams on
1436/// the NVPTX device.
1437///
1438/// Introduction to Decoupling
1439///
1440/// We would like to decouple the compiler and the runtime so that the
1441/// latter is ignorant of the reduction variables (number, data types)
1442/// and the reduction operators. This allows a simpler interface
1443/// and implementation while still attaining good performance.
1444///
1445/// Pseudocode for the aforementioned OpenMP program generated by the
1446/// compiler is as follows:
1447///
1448/// 1. Create private copies of reduction variables on each OpenMP
1449/// thread: 'foo_private', 'bar_private'
1450/// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
1451/// to it and writes the result in 'foo_private' and 'bar_private'
1452/// respectively.
1453/// 3. Call the OpenMP runtime on the GPU to reduce within a team
1454/// and store the result on the team master:
1455///
1456/// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
1457/// reduceData, shuffleReduceFn, interWarpCpyFn)
1458///
1459/// where:
1460/// struct ReduceData {
1461/// double *foo;
1462/// double *bar;
1463/// } reduceData
1464/// reduceData.foo = &foo_private
1465/// reduceData.bar = &bar_private
1466///
1467/// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
1468/// auxiliary functions generated by the compiler that operate on
1469/// variables of type 'ReduceData'. They aid the runtime perform
1470/// algorithmic steps in a data agnostic manner.
1471///
1472/// 'shuffleReduceFn' is a pointer to a function that reduces data
1473/// of type 'ReduceData' across two OpenMP threads (lanes) in the
1474/// same warp. It takes the following arguments as input:
1475///
1476/// a. variable of type 'ReduceData' on the calling lane,
1477/// b. its lane_id,
1478/// c. an offset relative to the current lane_id to generate a
1479/// remote_lane_id. The remote lane contains the second
1480/// variable of type 'ReduceData' that is to be reduced.
1481/// d. an algorithm version parameter determining which reduction
1482/// algorithm to use.
1483///
1484/// 'shuffleReduceFn' retrieves data from the remote lane using
1485/// efficient GPU shuffle intrinsics and reduces, using the
1486/// algorithm specified by the 4th parameter, the two operands
1487/// element-wise. The result is written to the first operand.
1488///
1489/// Different reduction algorithms are implemented in different
1490/// runtime functions, all calling 'shuffleReduceFn' to perform
1491/// the essential reduction step. Therefore, based on the 4th
1492/// parameter, this function behaves slightly differently to
1493/// cooperate with the runtime to ensure correctness under
1494/// different circumstances.
1495///
1496/// 'InterWarpCpyFn' is a pointer to a function that transfers
1497/// reduced variables across warps. It tunnels, through CUDA
1498/// shared memory, the thread-private data of type 'ReduceData'
1499/// from lane 0 of each warp to a lane in the first warp.
1500/// 4. Call the OpenMP runtime on the GPU to reduce across teams.
1501/// The last team writes the global reduced value to memory.
1502///
1503/// ret = __kmpc_nvptx_teams_reduce_nowait(...,
1504/// reduceData, shuffleReduceFn, interWarpCpyFn,
1505/// scratchpadCopyFn, loadAndReduceFn)
1506///
1507/// 'scratchpadCopyFn' is a helper that stores reduced
1508/// data from the team master to a scratchpad array in
1509/// global memory.
1510///
1511/// 'loadAndReduceFn' is a helper that loads data from
1512/// the scratchpad array and reduces it with the input
1513/// operand.
1514///
1515/// These compiler generated functions hide address
1516/// calculation and alignment information from the runtime.
1517/// 5. if ret == 1:
1518/// The team master of the last team stores the reduced
1519/// result to the globals in memory.
1520/// foo += reduceData.foo; bar *= reduceData.bar
1521///
1522///
1523/// Warp Reduction Algorithms
1524///
1525/// On the warp level, we have three algorithms implemented in the
1526/// OpenMP runtime depending on the number of active lanes:
1527///
1528/// Full Warp Reduction
1529///
1530/// The reduce algorithm within a warp where all lanes are active
1531/// is implemented in the runtime as follows:
1532///
1533/// full_warp_reduce(void *reduce_data,
1534/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
1535/// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
1536/// ShuffleReduceFn(reduce_data, 0, offset, 0);
1537/// }
1538///
1539/// The algorithm completes in log(2, WARPSIZE) steps.
1540///
1541/// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
1542/// not used therefore we save instructions by not retrieving lane_id
1543/// from the corresponding special registers. The 4th parameter, which
1544/// represents the version of the algorithm being used, is set to 0 to
1545/// signify full warp reduction.
1546///
1547/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1548///
1549/// #reduce_elem refers to an element in the local lane's data structure
1550/// #remote_elem is retrieved from a remote lane
1551/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1552/// reduce_elem = reduce_elem REDUCE_OP remote_elem;
1553///
1554/// Contiguous Partial Warp Reduction
1555///
1556/// This reduce algorithm is used within a warp where only the first
1557/// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
1558/// number of OpenMP threads in a parallel region is not a multiple of
1559/// WARPSIZE. The algorithm is implemented in the runtime as follows:
1560///
1561/// void
1562/// contiguous_partial_reduce(void *reduce_data,
1563/// kmp_ShuffleReductFctPtr ShuffleReduceFn,
1564/// int size, int lane_id) {
1565/// int curr_size;
1566/// int offset;
1567/// curr_size = size;
1568/// mask = curr_size/2;
1569/// while (offset>0) {
1570/// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
1571/// curr_size = (curr_size+1)/2;
1572/// offset = curr_size/2;
1573/// }
1574/// }
1575///
1576/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1577///
1578/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1579/// if (lane_id < offset)
1580/// reduce_elem = reduce_elem REDUCE_OP remote_elem
1581/// else
1582/// reduce_elem = remote_elem
1583///
1584/// This algorithm assumes that the data to be reduced are located in a
1585/// contiguous subset of lanes starting from the first. When there is
1586/// an odd number of active lanes, the data in the last lane is not
1587/// aggregated with any other lane's dat but is instead copied over.
1588///
1589/// Dispersed Partial Warp Reduction
1590///
1591/// This algorithm is used within a warp when any discontiguous subset of
1592/// lanes are active. It is used to implement the reduction operation
1593/// across lanes in an OpenMP simd region or in a nested parallel region.
1594///
1595/// void
1596/// dispersed_partial_reduce(void *reduce_data,
1597/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
1598/// int size, remote_id;
1599/// int logical_lane_id = number_of_active_lanes_before_me() * 2;
1600/// do {
1601/// remote_id = next_active_lane_id_right_after_me();
1602/// # the above function returns 0 of no active lane
1603/// # is present right after the current lane.
1604/// size = number_of_active_lanes_in_this_warp();
1605/// logical_lane_id /= 2;
1606/// ShuffleReduceFn(reduce_data, logical_lane_id,
1607/// remote_id-1-threadIdx.x, 2);
1608/// } while (logical_lane_id % 2 == 0 && size > 1);
1609/// }
1610///
1611/// There is no assumption made about the initial state of the reduction.
1612/// Any number of lanes (>=1) could be active at any position. The reduction
1613/// result is returned in the first active lane.
1614///
1615/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1616///
1617/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1618/// if (lane_id % 2 == 0 && offset > 0)
1619/// reduce_elem = reduce_elem REDUCE_OP remote_elem
1620/// else
1621/// reduce_elem = remote_elem
1622///
1623///
1624/// Intra-Team Reduction
1625///
1626/// This function, as implemented in the runtime call
1627/// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
1628/// threads in a team. It first reduces within a warp using the
1629/// aforementioned algorithms. We then proceed to gather all such
1630/// reduced values at the first warp.
1631///
1632/// The runtime makes use of the function 'InterWarpCpyFn', which copies
1633/// data from each of the "warp master" (zeroth lane of each warp, where
1634/// warp-reduced data is held) to the zeroth warp. This step reduces (in
1635/// a mathematical sense) the problem of reduction across warp masters in
1636/// a block to the problem of warp reduction.
1637///
1638///
1639/// Inter-Team Reduction
1640///
1641/// Once a team has reduced its data to a single value, it is stored in
1642/// a global scratchpad array. Since each team has a distinct slot, this
1643/// can be done without locking.
1644///
1645/// The last team to write to the scratchpad array proceeds to reduce the
1646/// scratchpad array. One or more workers in the last team use the helper
1647/// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
1648/// the k'th worker reduces every k'th element.
1649///
1650/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
1651/// reduce across workers and compute a globally reduced value.
1652///
1656 ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
1657 if (!CGF.HaveInsertPoint())
1658 return;
1659
1660 bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
1661 bool DistributeReduction = isOpenMPDistributeDirective(Options.ReductionKind);
1662 bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
1663
1665
1666 if (Options.SimpleReduction) {
1667 assert(!TeamsReduction && !ParallelReduction &&
1668 "Invalid reduction selection in emitReduction.");
1669 (void)ParallelReduction;
1670 CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
1671 ReductionOps, Options);
1672 return;
1673 }
1674
1675 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
1676 llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
1677 int Cnt = 0;
1678 for (const Expr *DRE : Privates) {
1679 PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
1680 ++Cnt;
1681 }
1682 const RecordDecl *ReductionRec = ::buildRecordForGlobalizedVars(
1683 CGM.getContext(), PrivatesReductions, {}, VarFieldMap, 1);
1684
1685 if (TeamsReduction)
1686 TeamsReductions.push_back(ReductionRec);
1687
1688 // Source location for the ident struct
1689 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
1690
1691 using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
1692 InsertPointTy AllocaIP(CGF.AllocaInsertPt->getParent(),
1693 CGF.AllocaInsertPt->getIterator());
1694 InsertPointTy CodeGenIP(CGF.Builder.GetInsertBlock(),
1695 CGF.Builder.GetInsertPoint());
1696 llvm::OpenMPIRBuilder::LocationDescription OmpLoc(
1697 CodeGenIP, CGF.SourceLocToDebugLoc(Loc));
1699
1701 unsigned Idx = 0;
1702 for (const Expr *Private : Privates) {
1703 llvm::Type *ElementType;
1704 llvm::Value *Variable;
1705 llvm::Value *PrivateVariable;
1706 llvm::OpenMPIRBuilder::ReductionGenAtomicCBTy AtomicReductionGen = nullptr;
1707 ElementType = CGF.ConvertTypeForMem(Private->getType());
1708 const auto *RHSVar =
1709 cast<VarDecl>(cast<DeclRefExpr>(RHSExprs[Idx])->getDecl());
1710 PrivateVariable = CGF.GetAddrOfLocalVar(RHSVar).emitRawPointer(CGF);
1711 const auto *LHSVar =
1712 cast<VarDecl>(cast<DeclRefExpr>(LHSExprs[Idx])->getDecl());
1713 Variable = CGF.GetAddrOfLocalVar(LHSVar).emitRawPointer(CGF);
1714 llvm::OpenMPIRBuilder::EvalKind EvalKind;
1715 switch (CGF.getEvaluationKind(Private->getType())) {
1716 case TEK_Scalar:
1717 EvalKind = llvm::OpenMPIRBuilder::EvalKind::Scalar;
1718 break;
1719 case TEK_Complex:
1720 EvalKind = llvm::OpenMPIRBuilder::EvalKind::Complex;
1721 break;
1722 case TEK_Aggregate:
1723 EvalKind = llvm::OpenMPIRBuilder::EvalKind::Aggregate;
1724 break;
1725 }
1726 auto ReductionGen = [&](InsertPointTy CodeGenIP, unsigned I,
1727 llvm::Value **LHSPtr, llvm::Value **RHSPtr,
1728 llvm::Function *NewFunc) {
1729 CGF.Builder.restoreIP(CodeGenIP);
1730 auto *CurFn = CGF.CurFn;
1731 CGF.CurFn = NewFunc;
1732
1733 *LHSPtr = CGF.GetAddrOfLocalVar(
1734 cast<VarDecl>(cast<DeclRefExpr>(LHSExprs[I])->getDecl()))
1735 .emitRawPointer(CGF);
1736 *RHSPtr = CGF.GetAddrOfLocalVar(
1737 cast<VarDecl>(cast<DeclRefExpr>(RHSExprs[I])->getDecl()))
1738 .emitRawPointer(CGF);
1739
1740 emitSingleReductionCombiner(CGF, ReductionOps[I], Privates[I],
1741 cast<DeclRefExpr>(LHSExprs[I]),
1742 cast<DeclRefExpr>(RHSExprs[I]));
1743
1744 CGF.CurFn = CurFn;
1745
1746 return InsertPointTy(CGF.Builder.GetInsertBlock(),
1747 CGF.Builder.GetInsertPoint());
1748 };
1749 ReductionInfos.emplace_back(llvm::OpenMPIRBuilder::ReductionInfo(
1750 ElementType, Variable, PrivateVariable, EvalKind,
1751 /*ReductionGen=*/nullptr, ReductionGen, AtomicReductionGen));
1752 Idx++;
1753 }
1754
1755 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
1756 OMPBuilder.createReductionsGPU(
1757 OmpLoc, AllocaIP, CodeGenIP, ReductionInfos, false, TeamsReduction,
1758 DistributeReduction, llvm::OpenMPIRBuilder::ReductionGenCBKind::Clang,
1759 CGF.getTarget().getGridValue(),
1760 C.getLangOpts().OpenMPCUDAReductionBufNum, RTLoc);
1761 assert(AfterIP && "unexpected error creating GPU reductions");
1762 CGF.Builder.restoreIP(*AfterIP);
1763 return;
1764}
1765
1766const VarDecl *
1768 const VarDecl *NativeParam) const {
1769 if (!NativeParam->getType()->isReferenceType())
1770 return NativeParam;
1771 QualType ArgType = NativeParam->getType();
1773 const Type *NonQualTy = QC.strip(ArgType);
1774 QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
1775 if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
1776 if (Attr->getCaptureKind() == OMPC_map) {
1777 PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
1779 }
1780 }
1781 ArgType = CGM.getContext().getPointerType(PointeeTy);
1782 QC.addRestrict();
1783 enum { NVPTX_local_addr = 5 };
1784 QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
1785 ArgType = QC.apply(CGM.getContext(), ArgType);
1786 if (isa<ImplicitParamDecl>(NativeParam))
1788 CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
1789 NativeParam->getIdentifier(), ArgType, ImplicitParamKind::Other);
1790 return ParmVarDecl::Create(
1791 CGM.getContext(),
1792 const_cast<DeclContext *>(NativeParam->getDeclContext()),
1793 NativeParam->getBeginLoc(), NativeParam->getLocation(),
1794 NativeParam->getIdentifier(), ArgType,
1795 /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
1796}
1797
1798Address
1800 const VarDecl *NativeParam,
1801 const VarDecl *TargetParam) const {
1802 assert(NativeParam != TargetParam &&
1803 NativeParam->getType()->isReferenceType() &&
1804 "Native arg must not be the same as target arg.");
1805 Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
1806 QualType NativeParamType = NativeParam->getType();
1808 const Type *NonQualTy = QC.strip(NativeParamType);
1809 QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
1810 unsigned NativePointeeAddrSpace =
1811 CGF.getTypes().getTargetAddressSpace(NativePointeeTy);
1812 QualType TargetTy = TargetParam->getType();
1813 llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(LocalAddr, /*Volatile=*/false,
1814 TargetTy, SourceLocation());
1815 // Cast to native address space.
1817 TargetAddr,
1818 llvm::PointerType::get(CGF.getLLVMContext(), NativePointeeAddrSpace));
1819 Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
1820 CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
1821 NativeParamType);
1822 return NativeParamAddr;
1823}
1824
1826 CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
1827 ArrayRef<llvm::Value *> Args) const {
1829 TargetArgs.reserve(Args.size());
1830 auto *FnType = OutlinedFn.getFunctionType();
1831 for (unsigned I = 0, E = Args.size(); I < E; ++I) {
1832 if (FnType->isVarArg() && FnType->getNumParams() <= I) {
1833 TargetArgs.append(std::next(Args.begin(), I), Args.end());
1834 break;
1835 }
1836 llvm::Type *TargetType = FnType->getParamType(I);
1837 llvm::Value *NativeArg = Args[I];
1838 if (!TargetType->isPointerTy()) {
1839 TargetArgs.emplace_back(NativeArg);
1840 continue;
1841 }
1842 TargetArgs.emplace_back(
1843 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(NativeArg, TargetType));
1844 }
1845 CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
1846}
1847
1848/// Emit function which wraps the outline parallel region
1849/// and controls the arguments which are passed to this function.
1850/// The wrapper ensures that the outlined function is called
1851/// with the correct arguments when data is shared.
1852llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
1853 llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
1854 ASTContext &Ctx = CGM.getContext();
1855 const auto &CS = *D.getCapturedStmt(OMPD_parallel);
1856
1857 // Create a function that takes as argument the source thread.
1858 FunctionArgList WrapperArgs;
1859 QualType Int16QTy =
1860 Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
1861 QualType Int32QTy =
1862 Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
1863 ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
1864 /*Id=*/nullptr, Int16QTy,
1866 ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
1867 /*Id=*/nullptr, Int32QTy,
1869 WrapperArgs.emplace_back(&ParallelLevelArg);
1870 WrapperArgs.emplace_back(&WrapperArg);
1871
1872 const CGFunctionInfo &CGFI =
1874
1875 auto *Fn = llvm::Function::Create(
1876 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
1877 Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
1878
1879 // Ensure we do not inline the function. This is trivially true for the ones
1880 // passed to __kmpc_fork_call but the ones calles in serialized regions
1881 // could be inlined. This is not a perfect but it is closer to the invariant
1882 // we want, namely, every data environment starts with a new function.
1883 // TODO: We should pass the if condition to the runtime function and do the
1884 // handling there. Much cleaner code.
1885 Fn->addFnAttr(llvm::Attribute::NoInline);
1886
1888 Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
1889 Fn->setDoesNotRecurse();
1890
1891 CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
1892 CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
1893 D.getBeginLoc(), D.getBeginLoc());
1894
1895 const auto *RD = CS.getCapturedRecordDecl();
1896 auto CurField = RD->field_begin();
1897
1898 Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
1899 /*Name=*/".zero.addr");
1900 CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
1901 // Get the array of arguments.
1903
1904 Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).emitRawPointer(CGF));
1905 Args.emplace_back(ZeroAddr.emitRawPointer(CGF));
1906
1907 CGBuilderTy &Bld = CGF.Builder;
1908 auto CI = CS.capture_begin();
1909
1910 // Use global memory for data sharing.
1911 // Handle passing of global args to workers.
1912 RawAddress GlobalArgs =
1913 CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
1914 llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
1915 llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
1916 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1917 CGM.getModule(), OMPRTL___kmpc_get_shared_variables),
1918 DataSharingArgs);
1919
1920 // Retrieve the shared variables from the list of references returned
1921 // by the runtime. Pass the variables to the outlined function.
1922 Address SharedArgListAddress = Address::invalid();
1923 if (CS.capture_size() > 0 ||
1924 isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
1925 SharedArgListAddress = CGF.EmitLoadOfPointer(
1926 GlobalArgs, CGF.getContext()
1928 .castAs<PointerType>());
1929 }
1930 unsigned Idx = 0;
1931 if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
1932 Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
1934 Src, Bld.getPtrTy(0), CGF.SizeTy);
1935 llvm::Value *LB = CGF.EmitLoadOfScalar(
1936 TypedAddress,
1937 /*Volatile=*/false,
1939 cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
1940 Args.emplace_back(LB);
1941 ++Idx;
1942 Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
1943 TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(Src, Bld.getPtrTy(0),
1944 CGF.SizeTy);
1945 llvm::Value *UB = CGF.EmitLoadOfScalar(
1946 TypedAddress,
1947 /*Volatile=*/false,
1949 cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
1950 Args.emplace_back(UB);
1951 ++Idx;
1952 }
1953 if (CS.capture_size() > 0) {
1954 ASTContext &CGFContext = CGF.getContext();
1955 for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
1956 QualType ElemTy = CurField->getType();
1957 Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
1959 Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)),
1960 CGF.ConvertTypeForMem(ElemTy));
1961 llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
1962 /*Volatile=*/false,
1963 CGFContext.getPointerType(ElemTy),
1964 CI->getLocation());
1965 if (CI->capturesVariableByCopy() &&
1966 !CI->getCapturedVar()->getType()->isAnyPointerType()) {
1967 Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
1968 CI->getLocation());
1969 }
1970 Args.emplace_back(Arg);
1971 }
1972 }
1973
1974 emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
1975 CGF.FinishFunction();
1976 return Fn;
1977}
1978
1980 const Decl *D) {
1981 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
1982 return;
1983
1984 assert(D && "Expected function or captured|block decl.");
1985 assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
1986 "Function is registered already.");
1987 assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
1988 "Team is set but not processed.");
1989 const Stmt *Body = nullptr;
1990 bool NeedToDelayGlobalization = false;
1991 if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
1992 Body = FD->getBody();
1993 } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
1994 Body = BD->getBody();
1995 } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
1996 Body = CD->getBody();
1997 NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
1998 if (NeedToDelayGlobalization &&
1999 getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
2000 return;
2001 }
2002 if (!Body)
2003 return;
2004 CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
2005 VarChecker.Visit(Body);
2006 const RecordDecl *GlobalizedVarsRecord =
2007 VarChecker.getGlobalizedRecord(IsInTTDRegion);
2008 TeamAndReductions.first = nullptr;
2009 TeamAndReductions.second.clear();
2010 ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
2011 VarChecker.getEscapedVariableLengthDecls();
2012 ArrayRef<const ValueDecl *> DelayedVariableLengthDecls =
2013 VarChecker.getDelayedVariableLengthDecls();
2014 if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty() &&
2015 DelayedVariableLengthDecls.empty())
2016 return;
2017 auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
2018 I->getSecond().MappedParams =
2019 std::make_unique<CodeGenFunction::OMPMapVars>();
2020 I->getSecond().EscapedParameters.insert(
2021 VarChecker.getEscapedParameters().begin(),
2022 VarChecker.getEscapedParameters().end());
2023 I->getSecond().EscapedVariableLengthDecls.append(
2024 EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
2025 I->getSecond().DelayedVariableLengthDecls.append(
2026 DelayedVariableLengthDecls.begin(), DelayedVariableLengthDecls.end());
2027 DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
2028 for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
2029 assert(VD->isCanonicalDecl() && "Expected canonical declaration");
2030 Data.insert(std::make_pair(VD, MappedVarData()));
2031 }
2032 if (!NeedToDelayGlobalization) {
2033 emitGenericVarsProlog(CGF, D->getBeginLoc());
2034 struct GlobalizationScope final : EHScopeStack::Cleanup {
2035 GlobalizationScope() = default;
2036
2037 void Emit(CodeGenFunction &CGF, Flags flags) override {
2038 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
2039 .emitGenericVarsEpilog(CGF);
2040 }
2041 };
2042 CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
2043 }
2044}
2045
2047 const VarDecl *VD) {
2048 if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
2049 const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
2050 auto AS = LangAS::Default;
2051 switch (A->getAllocatorType()) {
2052 case OMPAllocateDeclAttr::OMPNullMemAlloc:
2053 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
2054 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
2055 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
2056 break;
2057 case OMPAllocateDeclAttr::OMPThreadMemAlloc:
2058 return Address::invalid();
2059 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
2060 // TODO: implement aupport for user-defined allocators.
2061 return Address::invalid();
2062 case OMPAllocateDeclAttr::OMPConstMemAlloc:
2064 break;
2065 case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
2067 break;
2068 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
2069 case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
2070 break;
2071 }
2072 llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
2073 auto *GV = new llvm::GlobalVariable(
2074 CGM.getModule(), VarTy, /*isConstant=*/false,
2075 llvm::GlobalValue::InternalLinkage, llvm::PoisonValue::get(VarTy),
2076 VD->getName(),
2077 /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
2079 CharUnits Align = CGM.getContext().getDeclAlign(VD);
2080 GV->setAlignment(Align.getAsAlign());
2081 return Address(
2083 GV, CGF.Builder.getPtrTy(CGM.getContext().getTargetAddressSpace(
2084 VD->getType().getAddressSpace()))),
2085 VarTy, Align);
2086 }
2087
2088 if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
2089 return Address::invalid();
2090
2091 VD = VD->getCanonicalDecl();
2092 auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
2093 if (I == FunctionGlobalizedDecls.end())
2094 return Address::invalid();
2095 auto VDI = I->getSecond().LocalVarData.find(VD);
2096 if (VDI != I->getSecond().LocalVarData.end())
2097 return VDI->second.PrivateAddr;
2098 if (VD->hasAttrs()) {
2100 E(VD->attr_end());
2101 IT != E; ++IT) {
2102 auto VDI = I->getSecond().LocalVarData.find(
2103 cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
2104 ->getCanonicalDecl());
2105 if (VDI != I->getSecond().LocalVarData.end())
2106 return VDI->second.PrivateAddr;
2107 }
2108 }
2109
2110 return Address::invalid();
2111}
2112
2114 FunctionGlobalizedDecls.erase(CGF.CurFn);
2116}
2117
2119 CodeGenFunction &CGF, const OMPLoopDirective &S,
2120 OpenMPDistScheduleClauseKind &ScheduleKind,
2121 llvm::Value *&Chunk) const {
2122 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
2123 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
2124 ScheduleKind = OMPC_DIST_SCHEDULE_static;
2125 Chunk = CGF.EmitScalarConversion(
2126 RT.getGPUNumThreads(CGF),
2127 CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
2128 S.getIterationVariable()->getType(), S.getBeginLoc());
2129 return;
2130 }
2132 CGF, S, ScheduleKind, Chunk);
2133}
2134
2136 CodeGenFunction &CGF, const OMPLoopDirective &S,
2137 OpenMPScheduleClauseKind &ScheduleKind,
2138 const Expr *&ChunkExpr) const {
2139 ScheduleKind = OMPC_SCHEDULE_static;
2140 // Chunk size is 1 in this case.
2141 llvm::APInt ChunkSize(32, 1);
2142 ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize,
2143 CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
2144 SourceLocation());
2145}
2146
2148 CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
2149 assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
2150 " Expected target-based directive.");
2151 const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
2152 for (const CapturedStmt::Capture &C : CS->captures()) {
2153 // Capture variables captured by reference in lambdas for target-based
2154 // directives.
2155 if (!C.capturesVariable())
2156 continue;
2157 const VarDecl *VD = C.getCapturedVar();
2158 const auto *RD = VD->getType()
2162 if (!RD || !RD->isLambda())
2163 continue;
2164 Address VDAddr = CGF.GetAddrOfLocalVar(VD);
2165 LValue VDLVal;
2167 VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
2168 else
2169 VDLVal = CGF.MakeAddrLValue(
2170 VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
2171 llvm::DenseMap<const ValueDecl *, FieldDecl *> Captures;
2172 FieldDecl *ThisCapture = nullptr;
2173 RD->getCaptureFields(Captures, ThisCapture);
2174 if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
2175 LValue ThisLVal =
2176 CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
2177 llvm::Value *CXXThis = CGF.LoadCXXThis();
2178 CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
2179 }
2180 for (const LambdaCapture &LC : RD->captures()) {
2181 if (LC.getCaptureKind() != LCK_ByRef)
2182 continue;
2183 const ValueDecl *VD = LC.getCapturedVar();
2184 // FIXME: For now VD is always a VarDecl because OpenMP does not support
2185 // capturing structured bindings in lambdas yet.
2186 if (!CS->capturesVariable(cast<VarDecl>(VD)))
2187 continue;
2188 auto It = Captures.find(VD);
2189 assert(It != Captures.end() && "Found lambda capture without field.");
2190 LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
2191 Address VDAddr = CGF.GetAddrOfLocalVar(cast<VarDecl>(VD));
2193 VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
2194 VD->getType().getCanonicalType())
2195 .getAddress();
2196 CGF.EmitStoreOfScalar(VDAddr.emitRawPointer(CGF), VarLVal);
2197 }
2198 }
2199}
2200
2202 LangAS &AS) {
2203 if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
2204 return false;
2205 const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
2206 switch(A->getAllocatorType()) {
2207 case OMPAllocateDeclAttr::OMPNullMemAlloc:
2208 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
2209 // Not supported, fallback to the default mem space.
2210 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
2211 case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
2212 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
2213 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
2214 case OMPAllocateDeclAttr::OMPThreadMemAlloc:
2215 AS = LangAS::Default;
2216 return true;
2217 case OMPAllocateDeclAttr::OMPConstMemAlloc:
2219 return true;
2220 case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
2222 return true;
2223 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
2224 llvm_unreachable("Expected predefined allocator for the variables with the "
2225 "static storage.");
2226 }
2227 return false;
2228}
2229
2230// Get current OffloadArch and ignore any unknown values
2232 if (!CGM.getTarget().hasFeature("ptx"))
2233 return OffloadArch::UNKNOWN;
2234 for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) {
2235 if (Feature.getValue()) {
2236 OffloadArch Arch = StringToOffloadArch(Feature.getKey());
2237 if (Arch != OffloadArch::UNKNOWN)
2238 return Arch;
2239 }
2240 }
2241 return OffloadArch::UNKNOWN;
2242}
2243
2244/// Check to see if target architecture supports unified addressing which is
2245/// a restriction for OpenMP requires clause "unified_shared_memory".
2247 for (const OMPClause *Clause : D->clauselists()) {
2248 if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
2250 switch (Arch) {
2251 case OffloadArch::SM_20:
2252 case OffloadArch::SM_21:
2253 case OffloadArch::SM_30:
2255 case OffloadArch::SM_35:
2256 case OffloadArch::SM_37:
2257 case OffloadArch::SM_50:
2258 case OffloadArch::SM_52:
2259 case OffloadArch::SM_53: {
2260 SmallString<256> Buffer;
2261 llvm::raw_svector_ostream Out(Buffer);
2262 Out << "Target architecture " << OffloadArchToString(Arch)
2263 << " does not support unified addressing";
2264 CGM.Error(Clause->getBeginLoc(), Out.str());
2265 return;
2266 }
2267 case OffloadArch::SM_60:
2268 case OffloadArch::SM_61:
2269 case OffloadArch::SM_62:
2270 case OffloadArch::SM_70:
2271 case OffloadArch::SM_72:
2272 case OffloadArch::SM_75:
2273 case OffloadArch::SM_80:
2274 case OffloadArch::SM_86:
2275 case OffloadArch::SM_87:
2276 case OffloadArch::SM_89:
2277 case OffloadArch::SM_90:
2337 break;
2338 case OffloadArch::LAST:
2339 llvm_unreachable("Unexpected GPU arch.");
2340 }
2341 }
2342 }
2344}
2345
2347 CGBuilderTy &Bld = CGF.Builder;
2348 llvm::Module *M = &CGF.CGM.getModule();
2349 const char *LocSize = "__kmpc_get_hardware_num_threads_in_block";
2350 llvm::Function *F = M->getFunction(LocSize);
2351 if (!F) {
2352 F = llvm::Function::Create(llvm::FunctionType::get(CGF.Int32Ty, {}, false),
2353 llvm::GlobalVariable::ExternalLinkage, LocSize,
2354 &CGF.CGM.getModule());
2355 }
2356 return Bld.CreateCall(F, {}, "nvptx_num_threads");
2357}
2358
2361 return CGF.EmitRuntimeCall(
2362 OMPBuilder.getOrCreateRuntimeFunction(
2363 CGM.getModule(), OMPRTL___kmpc_get_hardware_thread_id_in_block),
2364 Args);
2365}
#define V(N, I)
Definition: ASTContext.h:3443
static void getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D, llvm::SmallVectorImpl< const ValueDecl * > &Vars)
Get list of reduction variables from the teams ... directives.
static llvm::Value * castValueToType(CodeGenFunction &CGF, llvm::Value *Val, QualType ValTy, QualType CastTy, SourceLocation Loc)
Cast value to the specified type.
static void getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D, llvm::SmallVectorImpl< const ValueDecl * > &Vars)
Get list of lastprivate variables from the teams distribute ... or teams {distribute ....
static bool hasNestedSPMDDirective(ASTContext &Ctx, const OMPExecutableDirective &D)
Check for inner (nested) SPMD construct, if any.
static bool supportsSPMDExecutionMode(ASTContext &Ctx, const OMPExecutableDirective &D)
static OffloadArch getOffloadArch(CodeGenModule &CGM)
const Decl * D
Expr * E
This file defines OpenMP nodes for declarative directives.
This file defines OpenMP AST classes for clauses.
VarDecl * Variable
Definition: SemaObjC.cpp:757
SourceLocation Loc
Definition: SemaObjC.cpp:759
static std::pair< ValueDecl *, bool > getPrivateItem(Sema &S, Expr *&RefExpr, SourceLocation &ELoc, SourceRange &ERange, bool AllowArraySection=false, StringRef DiagType="")
const char * Data
This file defines OpenMP AST classes for executable directives and clauses.
Holds long-lived AST nodes (such as types and decls) that can be referred to throughout the semantic ...
Definition: ASTContext.h:188
QualType getPointerType(QualType T) const
Return the uniqued reference to the type for a pointer to the specified type.
CanQualType VoidPtrTy
Definition: ASTContext.h:1187
QualType getUIntPtrType() const
Return a type compatible with "uintptr_t" (C99 7.18.1.4), as defined by the target.
QualType getIntTypeForBitwidth(unsigned DestWidth, unsigned Signed) const
getIntTypeForBitwidth - sets integer QualTy according to specified details: bitwidth,...
CanQualType getSizeType() const
Return the unique type for "size_t" (C99 7.17), defined in <stddef.h>.
CharUnits getDeclAlign(const Decl *D, bool ForAlignof=false) const
Return a conservative estimate of the alignment of the specified decl D.
CharUnits getTypeSizeInChars(QualType T) const
Return the size of the specified (complete) type T, in characters.
CanQualType VoidTy
Definition: ASTContext.h:1160
const TargetInfo & getTargetInfo() const
Definition: ASTContext.h:799
QualType getAddrSpaceQualType(QualType T, LangAS AddressSpace) const
Return the uniqued reference to the type for an address space qualified type with the specified type ...
unsigned getTargetAddressSpace(LangAS AS) const
Attr - This represents one attribute.
Definition: Attr.h:43
A class which contains all the information about a particular captured value.
Definition: Decl.h:4480
BlockExpr - Adaptor class for mixing a BlockDecl with expressions.
Definition: Expr.h:6414
CallExpr - Represents a function call (C99 6.5.2.2, C++ [expr.call]).
Definition: Expr.h:2874
Describes the capture of either a variable, or 'this', or variable-length array type.
Definition: Stmt.h:3797
This captures a statement into a function.
Definition: Stmt.h:3784
bool capturesVariable(const VarDecl *Var) const
True if this variable has been captured.
Definition: Stmt.cpp:1438
capture_range captures()
Definition: Stmt.h:3922
CharUnits - This is an opaque type for sizes expressed in character units.
Definition: CharUnits.h:38
bool isZero() const
isZero - Test whether the quantity equals zero.
Definition: CharUnits.h:122
llvm::Align getAsAlign() const
getAsAlign - Returns Quantity as a valid llvm::Align, Beware llvm::Align assumes power of two 8-bit b...
Definition: CharUnits.h:189
QuantityType getQuantity() const
getQuantity - Get the raw integer representation of this quantity.
Definition: CharUnits.h:185
Like RawAddress, an abstract representation of an aligned address, but the pointer contained in this ...
Definition: Address.h:128
static Address invalid()
Definition: Address.h:176
llvm::Value * emitRawPointer(CodeGenFunction &CGF) const
Return the pointer contained in this class after authenticating it and adding offset to it if necessa...
Definition: Address.h:251
Address withElementType(llvm::Type *ElemTy) const
Return address with different element type, but same pointer and alignment.
Definition: Address.h:274
llvm::StoreInst * CreateStore(llvm::Value *Val, Address Addr, bool IsVolatile=false)
Definition: CGBuilder.h:136
Address CreatePointerBitCastOrAddrSpaceCast(Address Addr, llvm::Type *Ty, llvm::Type *ElementTy, const llvm::Twine &Name="")
Definition: CGBuilder.h:203
Address CreateConstArrayGEP(Address Addr, uint64_t Index, const llvm::Twine &Name="")
Given addr = [n x T]* ... produce name = getelementptr inbounds addr, i64 0, i64 index where i64 is a...
Definition: CGBuilder.h:241
Address CreateConstInBoundsGEP(Address Addr, uint64_t Index, const llvm::Twine &Name="")
Given addr = T* ... produce name = getelementptr inbounds addr, i64 index where i64 is actually the t...
Definition: CGBuilder.h:261
CGFunctionInfo - Class to encapsulate the information about a function definition.
void emitParallelCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *OutlinedFn, ArrayRef< llvm::Value * > CapturedVars, const Expr *IfCond, llvm::Value *NumThreads) override
Emits code for parallel or serial call of the OutlinedFn with variables captured in a record which ad...
llvm::Function * emitTeamsOutlinedFunction(CodeGenFunction &CGF, const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) override
Emits inlined function for the specified OpenMP teams.
void emitProcBindClause(CodeGenFunction &CGF, llvm::omp::ProcBindKind ProcBind, SourceLocation Loc) override
Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid, int proc_bind) to generat...
void emitReduction(CodeGenFunction &CGF, SourceLocation Loc, ArrayRef< const Expr * > Privates, ArrayRef< const Expr * > LHSExprs, ArrayRef< const Expr * > RHSExprs, ArrayRef< const Expr * > ReductionOps, ReductionOptionsTy Options) override
Emit a code for reduction clause.
DataSharingMode
Target codegen is specialized based on two data-sharing modes: CUDA, in which the local variables are...
@ DS_CUDA
CUDA data sharing mode.
@ DS_Generic
Generic data-sharing mode.
void getDefaultDistScheduleAndChunk(CodeGenFunction &CGF, const OMPLoopDirective &S, OpenMPDistScheduleClauseKind &ScheduleKind, llvm::Value *&Chunk) const override
Choose a default value for the dist_schedule clause.
Address getAddressOfLocalVariable(CodeGenFunction &CGF, const VarDecl *VD) override
Gets the OpenMP-specific address of the local variable.
void emitFunctionProlog(CodeGenFunction &CGF, const Decl *D) override
Emits OpenMP-specific function prolog.
void getDefaultScheduleAndChunk(CodeGenFunction &CGF, const OMPLoopDirective &S, OpenMPScheduleClauseKind &ScheduleKind, const Expr *&ChunkExpr) const override
Choose a default value for the schedule clause.
void emitNumTeamsClause(CodeGenFunction &CGF, const Expr *NumTeams, const Expr *ThreadLimit, SourceLocation Loc) override
This function ought to emit, in the general case, a call to.
void emitCriticalRegion(CodeGenFunction &CGF, StringRef CriticalName, const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc, const Expr *Hint=nullptr) override
Emits a critical region.
void emitTeamsCall(CodeGenFunction &CGF, const OMPExecutableDirective &D, SourceLocation Loc, llvm::Function *OutlinedFn, ArrayRef< llvm::Value * > CapturedVars) override
Emits code for teams call of the OutlinedFn with variables captured in a record which address is stor...
bool hasAllocateAttributeForGlobalVar(const VarDecl *VD, LangAS &AS) override
Checks if the variable has associated OMPAllocateDeclAttr attribute with the predefined allocator and...
void getKmpcFreeShared(CodeGenFunction &CGF, const std::pair< llvm::Value *, llvm::Value * > &AddrSizePair) override
Get call to __kmpc_free_shared.
llvm::Function * emitParallelOutlinedFunction(CodeGenFunction &CGF, const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) override
Emits inlined function for the specified OpenMP parallel.
void functionFinished(CodeGenFunction &CGF) override
Cleans up references to the objects in finished function.
llvm::Value * getGPUThreadID(CodeGenFunction &CGF)
Get the id of the current thread on the GPU.
void processRequiresDirective(const OMPRequiresDecl *D) override
Perform check on requires decl to ensure that target architecture supports unified addressing.
bool isDelayedVariableLengthDecl(CodeGenFunction &CGF, const VarDecl *VD) const override
Declare generalized virtual functions which need to be defined by all specializations of OpenMPGPURun...
void emitOutlinedFunctionCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn, ArrayRef< llvm::Value * > Args={}) const override
Emits call of the outlined function with the provided arguments, translating these arguments to corre...
Address getParameterAddress(CodeGenFunction &CGF, const VarDecl *NativeParam, const VarDecl *TargetParam) const override
Gets the address of the native argument basing on the address of the target-specific parameter.
ExecutionMode
Defines the execution mode.
@ EM_NonSPMD
Non-SPMD execution mode (1 master thread, others are workers).
@ EM_Unknown
Unknown execution mode (orphaned directive).
@ EM_SPMD
SPMD execution mode (all threads are worker threads).
void emitBarrierCall(CodeGenFunction &CGF, SourceLocation Loc, OpenMPDirectiveKind Kind, bool EmitChecks=true, bool ForceSimpleCall=false) override
Emit an implicit/explicit barrier for OpenMP threads.
llvm::Value * getGPUNumThreads(CodeGenFunction &CGF)
Get the maximum number of threads in a block of the GPU.
const VarDecl * translateParameter(const FieldDecl *FD, const VarDecl *NativeParam) const override
Translates the native parameter of outlined function if this is required for target.
std::pair< llvm::Value *, llvm::Value * > getKmpcAllocShared(CodeGenFunction &CGF, const VarDecl *VD) override
Get call to __kmpc_alloc_shared.
bool isGPU() const override
Returns true if the current target is a GPU.
void emitNumThreadsClause(CodeGenFunction &CGF, llvm::Value *NumThreads, SourceLocation Loc) override
Emits call to void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_threads)...
void adjustTargetSpecificDataForLambdas(CodeGenFunction &CGF, const OMPExecutableDirective &D) const override
Adjust some parameters for the target-based directives, like addresses of the variables captured by r...
virtual Address emitThreadIDAddress(CodeGenFunction &CGF, SourceLocation Loc)
Emits address of the word in a memory where current thread id is stored.
static const Stmt * getSingleCompoundChild(ASTContext &Ctx, const Stmt *Body)
Checks if the Body is the CompoundStmt and returns its child statement iff there is only one that is ...
llvm::Value * emitUpdateLocation(CodeGenFunction &CGF, SourceLocation Loc, unsigned Flags=0, bool EmitLoc=false)
Emits object of ident_t type with info for source location.
virtual void functionFinished(CodeGenFunction &CGF)
Cleans up references to the objects in finished function.
virtual llvm::Function * emitTeamsOutlinedFunction(CodeGenFunction &CGF, const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen)
Emits outlined function for the specified OpenMP teams directive D.
llvm::OpenMPIRBuilder OMPBuilder
An OpenMP-IR-Builder instance.
void computeMinAndMaxThreadsAndTeams(const OMPExecutableDirective &D, CodeGenFunction &CGF, int32_t &MinThreadsVal, int32_t &MaxThreadsVal, int32_t &MinTeamsVal, int32_t &MaxTeamsVal)
Helper to determine the min/max number of threads/teams for D.
virtual void emitTargetOutlinedFunctionHelper(const OMPExecutableDirective &D, StringRef ParentName, llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID, bool IsOffloadEntry, const RegionCodeGenTy &CodeGen)
Helper to emit outlined function for 'target' directive.
bool hasRequiresUnifiedSharedMemory() const
Return whether the unified_shared_memory has been specified.
virtual void processRequiresDirective(const OMPRequiresDecl *D)
Perform check on requires decl to ensure that target architecture supports unified addressing.
llvm::Value * getThreadID(CodeGenFunction &CGF, SourceLocation Loc)
Gets thread id value for the current thread.
void clearLocThreadIdInsertPt(CodeGenFunction &CGF)
static unsigned getDefaultFlagsForBarriers(OpenMPDirectiveKind Kind)
Returns default flags for the barriers depending on the directive, for which this barier is going to ...
virtual llvm::Function * emitParallelOutlinedFunction(CodeGenFunction &CGF, const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen)
Emits outlined function for the specified OpenMP parallel directive D.
virtual void getDefaultDistScheduleAndChunk(CodeGenFunction &CGF, const OMPLoopDirective &S, OpenMPDistScheduleClauseKind &ScheduleKind, llvm::Value *&Chunk) const
Choose default schedule type and chunk value for the dist_schedule clause.
llvm::Type * getIdentTyPointerTy()
Returns pointer to ident_t type.
void emitSingleReductionCombiner(CodeGenFunction &CGF, const Expr *ReductionOp, const Expr *PrivateRef, const DeclRefExpr *LHS, const DeclRefExpr *RHS)
Emits single reduction combiner.
llvm::OpenMPIRBuilder & getOMPBuilder()
virtual void emitCriticalRegion(CodeGenFunction &CGF, StringRef CriticalName, const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc, const Expr *Hint=nullptr)
Emits a critical region.
virtual void emitOutlinedFunctionCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn, ArrayRef< llvm::Value * > Args={}) const
Emits call of the outlined function with the provided arguments, translating these arguments to corre...
virtual void emitReduction(CodeGenFunction &CGF, SourceLocation Loc, ArrayRef< const Expr * > Privates, ArrayRef< const Expr * > LHSExprs, ArrayRef< const Expr * > RHSExprs, ArrayRef< const Expr * > ReductionOps, ReductionOptionsTy Options)
Emit a code for reduction clause.
The scope used to remap some variables as private in the OpenMP loop body (or other captured region e...
CodeGenFunction - This class organizes the per-function state that is used while generating LLVM code...
void FinishFunction(SourceLocation EndLoc=SourceLocation())
FinishFunction - Complete IR generation of the current function.
static TypeEvaluationKind getEvaluationKind(QualType T)
getEvaluationKind - Return the TypeEvaluationKind of QualType T.
CGCapturedStmtInfo * CapturedStmtInfo
LValue MakeNaturalAlignPointeeRawAddrLValue(llvm::Value *V, QualType T)
Same as MakeNaturalAlignPointeeAddrLValue except that the pointer is known to be unsigned.
Address EmitLoadOfPointer(Address Ptr, const PointerType *PtrTy, LValueBaseInfo *BaseInfo=nullptr, TBAAAccessInfo *TBAAInfo=nullptr)
Load a pointer with type PtrTy stored at address Ptr.
RawAddress CreateDefaultAlignTempAlloca(llvm::Type *Ty, const Twine &Name="tmp")
CreateDefaultAlignedTempAlloca - This creates an alloca with the default ABI alignment of the given L...
llvm::BasicBlock * createBasicBlock(const Twine &name="", llvm::Function *parent=nullptr, llvm::BasicBlock *before=nullptr)
createBasicBlock - Create an LLVM basic block.
LValue EmitLValueForFieldInitialization(LValue Base, const FieldDecl *Field)
EmitLValueForFieldInitialization - Like EmitLValueForField, except that if the Field is a reference,...
void EmitBlock(llvm::BasicBlock *BB, bool IsFinished=false)
EmitBlock - Emit the given block.
llvm::Type * ConvertTypeForMem(QualType T)
llvm::AssertingVH< llvm::Instruction > AllocaInsertPt
AllocaInsertPoint - This is an instruction in the entry block before which we prefer to insert alloca...
RawAddress CreateMemTemp(QualType T, const Twine &Name="tmp", RawAddress *Alloca=nullptr)
CreateMemTemp - Create a temporary memory object of the given type, with appropriate alignmen and cas...
const TargetInfo & getTarget() const
llvm::DebugLoc SourceLocToDebugLoc(SourceLocation Location)
Converts Location to a DebugLoc, if debug information is enabled.
llvm::Value * getTypeSize(QualType Ty)
Returns calculated size of the specified type.
void StartFunction(GlobalDecl GD, QualType RetTy, llvm::Function *Fn, const CGFunctionInfo &FnInfo, const FunctionArgList &Args, SourceLocation Loc=SourceLocation(), SourceLocation StartLoc=SourceLocation())
Emit code for the start of a function.
bool HaveInsertPoint() const
HaveInsertPoint - True if an insertion point is defined.
void EmitBranch(llvm::BasicBlock *Block)
EmitBranch - Emit a branch to the specified basic block from the current insert block,...
llvm::Value * EmitLoadOfScalar(Address Addr, bool Volatile, QualType Ty, SourceLocation Loc, AlignmentSource Source=AlignmentSource::Type, bool isNontemporal=false)
EmitLoadOfScalar - Load a scalar value from an address, taking care to appropriately convert from the...
llvm::CallInst * EmitRuntimeCall(llvm::FunctionCallee callee, const Twine &name="")
CodeGenTypes & getTypes() const
llvm::Value * EvaluateExprAsBool(const Expr *E)
EvaluateExprAsBool - Perform the usual unary conversions on the specified expression and compare the ...
LValue MakeAddrLValue(Address Addr, QualType T, AlignmentSource Source=AlignmentSource::Type)
llvm::Value * LoadCXXThis()
LoadCXXThis - Load the value of 'this'.
LValue EmitLoadOfReferenceLValue(LValue RefLVal)
Address GetAddrOfLocalVar(const VarDecl *VD)
GetAddrOfLocalVar - Return the address of a local variable.
llvm::Value * EmitScalarConversion(llvm::Value *Src, QualType SrcTy, QualType DstTy, SourceLocation Loc)
Emit a conversion from the specified type to the specified destination type, both of which are LLVM s...
llvm::LLVMContext & getLLVMContext()
void EmitStoreOfScalar(llvm::Value *Value, Address Addr, bool Volatile, QualType Ty, AlignmentSource Source=AlignmentSource::Type, bool isInit=false, bool isNontemporal=false)
EmitStoreOfScalar - Store a scalar value to an address, taking care to appropriately convert from the...
This class organizes the cross-function state that is used while generating LLVM code.
void SetInternalFunctionAttributes(GlobalDecl GD, llvm::Function *F, const CGFunctionInfo &FI)
Set the attributes on the LLVM function for the given decl and function info.
llvm::Module & getModule() const
const LangOptions & getLangOpts() const
const TargetInfo & getTarget() const
void Error(SourceLocation loc, StringRef error)
Emit a general error that something can't be done.
CGOpenMPRuntime & getOpenMPRuntime()
Return a reference to the configured OpenMP runtime.
ASTContext & getContext() const
llvm::LLVMContext & getLLVMContext()
llvm::FunctionType * GetFunctionType(const CGFunctionInfo &Info)
GetFunctionType - Get the LLVM function type for.
Definition: CGCall.cpp:1630
const CGFunctionInfo & arrangeBuiltinFunctionDeclaration(QualType resultType, const FunctionArgList &args)
A builtin function is a freestanding function using the default C conventions.
Definition: CGCall.cpp:679
unsigned getTargetAddressSpace(QualType T) const
llvm::Type * ConvertTypeForMem(QualType T)
ConvertTypeForMem - Convert type T into a llvm::Type.
Information for lazily generating a cleanup.
Definition: EHScopeStack.h:141
FunctionArgList - Type for representing both the decl and type of parameters to a function.
Definition: CGCall.h:382
LValue - This represents an lvalue references.
Definition: CGValue.h:182
Address getAddress() const
Definition: CGValue.h:361
A basic class for pre|post-action for advanced codegen sequence for OpenMP region.
An abstract representation of an aligned address.
Definition: Address.h:42
llvm::Value * getPointer() const
Definition: Address.h:66
Class provides a way to call simple version of codegen for OpenMP region, or an advanced with possibl...
void setAction(PrePostActionTy &Action) const
ConstStmtVisitor - This class implements a simple visitor for Stmt subclasses.
Definition: StmtVisitor.h:195
DeclContext - This is used only as base class of specific decl types that can act as declaration cont...
Definition: DeclBase.h:1435
void addDecl(Decl *D)
Add the declaration D into this context.
Definition: DeclBase.cpp:1768
A reference to a declared variable, function, enum, etc.
Definition: Expr.h:1265
DeclStmt - Adaptor class for mixing declarations with statements and expressions.
Definition: Stmt.h:1519
Decl - This represents one declaration (or definition), e.g.
Definition: DeclBase.h:86
T * getAttr() const
Definition: DeclBase.h:576
bool hasAttrs() const
Definition: DeclBase.h:521
attr_iterator attr_end() const
Definition: DeclBase.h:545
bool isCanonicalDecl() const
Whether this particular Decl is a canonical one.
Definition: DeclBase.h:973
attr_iterator attr_begin() const
Definition: DeclBase.h:542
SourceLocation getLocation() const
Definition: DeclBase.h:442
DeclContext * getDeclContext()
Definition: DeclBase.h:451
SourceLocation getBeginLoc() const LLVM_READONLY
Definition: DeclBase.h:434
AttrVec & getAttrs()
Definition: DeclBase.h:527
bool hasAttr() const
Definition: DeclBase.h:580
virtual Decl * getCanonicalDecl()
Retrieves the "canonical" declaration of the given declaration.
Definition: DeclBase.h:967
SourceLocation getBeginLoc() const LLVM_READONLY
Definition: Decl.h:786
This represents one expression.
Definition: Expr.h:110
Expr * IgnoreParenImpCasts() LLVM_READONLY
Skip past any parentheses and implicit casts which might surround this expression until reaching a fi...
Definition: Expr.cpp:3090
Expr * IgnoreParens() LLVM_READONLY
Skip past any parentheses which might surround this expression until reaching a fixed point.
Definition: Expr.cpp:3086
bool isLValue() const
isLValue - True if this expression is an "l-value" according to the rules of the current language.
Definition: Expr.h:277
Represents a member of a struct/union/class.
Definition: Decl.h:3033
static FieldDecl * Create(const ASTContext &C, DeclContext *DC, SourceLocation StartLoc, SourceLocation IdLoc, const IdentifierInfo *Id, QualType T, TypeSourceInfo *TInfo, Expr *BW, bool Mutable, InClassInitStyle InitStyle)
Definition: Decl.cpp:4555
GlobalDecl - represents a global declaration.
Definition: GlobalDecl.h:56
ImplicitCastExpr - Allows us to explicitly represent implicit type conversions, which have no direct ...
Definition: Expr.h:3724
static ImplicitParamDecl * Create(ASTContext &C, DeclContext *DC, SourceLocation IdLoc, IdentifierInfo *Id, QualType T, ImplicitParamKind ParamKind)
Create implicit parameter.
Definition: Decl.cpp:5402
static IntegerLiteral * Create(const ASTContext &C, const llvm::APInt &V, QualType type, SourceLocation l)
Returns a new integer literal with value 'V' and type 'type'.
Definition: Expr.cpp:973
Describes the capture of a variable or of this, or of a C++1y init-capture.
Definition: LambdaCapture.h:25
A C++ lambda expression, which produces a function object (of unspecified type) that can be invoked l...
Definition: ExprCXX.h:1954
std::string OMPHostIRFile
Name of the IR file that contains the result of the OpenMP target host code generation.
Definition: LangOptions.h:577
IdentifierInfo * getIdentifier() const
Get the identifier that names this declaration, if there is one.
Definition: Decl.h:274
StringRef getName() const
Get the name of identifier for this declaration as a StringRef.
Definition: Decl.h:280
This is a basic class for representing single OpenMP clause.
Definition: OpenMPClause.h:55
This is a basic class for representing single OpenMP executable directive.
Definition: StmtOpenMP.h:266
OpenMPDirectiveKind getDirectiveKind() const
Definition: StmtOpenMP.h:556
static llvm::iterator_range< specific_clause_iterator< SpecificClause > > getClausesOfKind(ArrayRef< OMPClause * > Clauses)
Definition: StmtOpenMP.h:446
This represents clause 'lastprivate' in the '#pragma omp ...' directives.
This is a common base class for loop directives ('omp simd', 'omp for', 'omp for simd' etc....
Definition: StmtOpenMP.h:1004
This represents clause 'reduction' in the '#pragma omp ...' directives.
This represents '#pragma omp requires...' directive.
Definition: DeclOpenMP.h:417
This represents 'ompx_bare' clause in the '#pragma omp target teams ...' directive.
static ParmVarDecl * Create(ASTContext &C, DeclContext *DC, SourceLocation StartLoc, SourceLocation IdLoc, const IdentifierInfo *Id, QualType T, TypeSourceInfo *TInfo, StorageClass S, Expr *DefArg)
Definition: Decl.cpp:2922
PointerType - C99 6.7.5.1 - Pointer Declarators.
Definition: Type.h:3198
A (possibly-)qualified type.
Definition: Type.h:929
LangAS getAddressSpace() const
Return the address space of this type.
Definition: Type.h:8057
QualType getNonReferenceType() const
If Type is a reference type (e.g., const int&), returns the type that the reference refers to ("const...
Definition: Type.h:8134
QualType getCanonicalType() const
Definition: Type.h:7983
A qualifier set is used to build a set of qualifiers.
Definition: Type.h:7871
const Type * strip(QualType type)
Collect any qualifiers on the given type and return an unqualified type.
Definition: Type.h:7878
QualType apply(const ASTContext &Context, QualType QT) const
Apply the collected qualifiers to the given type.
Definition: Type.cpp:4420
void addAddressSpace(LangAS space)
Definition: Type.h:590
void addRestrict()
Definition: Type.h:473
Represents a struct/union/class.
Definition: Decl.h:4148
virtual void completeDefinition()
Note that the definition of this type is now complete.
Definition: Decl.cpp:5104
Scope - A scope is a transient data structure that is used while parsing the program.
Definition: Scope.h:41
Encodes a location in the source.
RetTy Visit(PTR(Stmt) S, ParamTys... P)
Definition: StmtVisitor.h:44
Stmt - This represents one statement.
Definition: Stmt.h:84
child_range children()
Definition: Stmt.cpp:294
void startDefinition()
Starts the definition of this tag declaration.
Definition: Decl.cpp:4760
unsigned getNewAlign() const
Return the largest alignment for which a suitably-sized allocation with '::operator new(size_t)' is g...
Definition: TargetInfo.h:752
TargetOptions & getTargetOpts() const
Retrieve the target options.
Definition: TargetInfo.h:311
virtual const llvm::omp::GV & getGridValue() const
Definition: TargetInfo.h:1660
virtual bool hasFeature(StringRef Feature) const
Determine whether the given target has the given feature.
Definition: TargetInfo.h:1493
llvm::StringMap< bool > FeatureMap
The map of which features have been enabled disabled based on the command line.
Definition: TargetOptions.h:62
The base class of the type hierarchy.
Definition: Type.h:1828
CXXRecordDecl * getAsCXXRecordDecl() const
Retrieves the CXXRecordDecl that this type refers to, either because the type is a RecordType or beca...
Definition: Type.cpp:1916
bool isIntegerType() const
isIntegerType() does not include complex integers (a GCC extension).
Definition: Type.h:8550
bool isReferenceType() const
Definition: Type.h:8204
QualType getPointeeType() const
If this is a pointer, ObjC object pointer, or block pointer, this returns the respective pointee.
Definition: Type.cpp:738
bool isLValueReferenceType() const
Definition: Type.h:8208
bool hasSignedIntegerRepresentation() const
Determine whether this type has an signed integer representation of some sort, e.g....
Definition: Type.cpp:2220
bool isVariablyModifiedType() const
Whether this type is a variably-modified type (C99 6.7.5).
Definition: Type.h:2724
UnaryOperator - This represents the unary-expression's (except sizeof and alignof),...
Definition: Expr.h:2232
Represent the declaration of a variable (in which case it is an lvalue) a function (in which case it ...
Definition: Decl.h:671
QualType getType() const
Definition: Decl.h:682
bool isInitCapture() const
Whether this variable is the implicit variable for a lambda init-capture.
Definition: Decl.cpp:5394
Represents a variable declaration or definition.
Definition: Decl.h:882
VarDecl * getCanonicalDecl() override
Retrieves the "canonical" declaration of the given declaration.
Definition: Decl.cpp:2246
bool isInitCapture() const
Whether this variable is the implicit variable for a lambda init-capture.
Definition: Decl.h:1522
specific_attr_iterator - Iterates over a subrange of an AttrVec, only providing attributes that are o...
Definition: AttrIterator.h:35
@ Type
The l-value was considered opaque, so the alignment was determined from a type.
@ Decl
The l-value was an access to a declared entity or something equivalently strong, like the address of ...
The JSON file list parser is used to communicate input to InstallAPI.
llvm::omp::Directive OpenMPDirectiveKind
OpenMP directives.
Definition: OpenMPKinds.h:25
@ ICIS_NoInit
No in-class initializer.
Definition: Specifiers.h:272
OffloadArch
Definition: Cuda.h:56
bool isOpenMPDistributeDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a distribute directive.
@ LCK_ByRef
Capturing by reference.
Definition: Lambda.h:37
@ Private
'private' clause, allowed on 'parallel', 'serial', 'loop', 'parallel loop', and 'serial loop' constru...
@ CR_OpenMP
Definition: CapturedStmt.h:19
bool isOpenMPParallelDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a parallel-kind directive.
bool isOpenMPPrivate(OpenMPClauseKind Kind)
Checks if the specified clause is one of private clauses like 'private', 'firstprivate',...
@ SC_None
Definition: Specifiers.h:250
OpenMPDistScheduleClauseKind
OpenMP attributes for 'dist_schedule' clause.
Definition: OpenMPKinds.h:104
bool isOpenMPTargetExecutionDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a target code offload directive.
bool isOpenMPTeamsDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a teams-kind directive.
OffloadArch StringToOffloadArch(llvm::StringRef S)
Definition: Cuda.cpp:180
bool isOpenMPLoopBoundSharingDirective(OpenMPDirectiveKind Kind)
Checks if the specified directive kind is one of the composite or combined directives that need loop ...
LangAS
Defines the address space values used by the address space qualifier of QualType.
Definition: AddressSpaces.h:25
const char * OffloadArchToString(OffloadArch A)
Definition: Cuda.cpp:162
void getOpenMPCaptureRegions(llvm::SmallVectorImpl< OpenMPDirectiveKind > &CaptureRegions, OpenMPDirectiveKind DKind)
Return the captured regions of an OpenMP directive.
LangAS getLangASFromTargetAS(unsigned TargetAS)
Definition: AddressSpaces.h:86
@ CXXThis
Parameter for C++ 'this' argument.
@ Other
Other implicit parameter.
OpenMPScheduleClauseKind
OpenMP attributes for 'schedule' clause.
Definition: OpenMPKinds.h:31
@ AS_public
Definition: Specifiers.h:124
int int32_t
unsigned long uint64_t