1//===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// Implementation of the interface to be used by Clang during the codegen of a
10// target region.
11//
12//===----------------------------------------------------------------------===//
13
14#include "omptarget.h"
15#include "OffloadPolicy.h"
16#include "OpenMP/OMPT/Callback.h"
17#include "OpenMP/OMPT/Interface.h"
18#include "PluginManager.h"
19#include "Shared/Debug.h"
20#include "Shared/EnvironmentVar.h"
21#include "Shared/Utils.h"
22#include "device.h"
23#include "private.h"
24#include "rtl.h"
25
26#include "Shared/Profile.h"
27
28#include "OpenMP/Mapping.h"
29#include "OpenMP/omp.h"
30
31#include "llvm/ADT/StringExtras.h"
32#include "llvm/ADT/bit.h"
33#include "llvm/Object/ObjectFile.h"
34
35#include <cassert>
36#include <cstdint>
37#include <vector>
38
39using llvm::SmallVector;
40#ifdef OMPT_SUPPORT
41using namespace llvm::omp::target::ompt;
42#endif
43
44int AsyncInfoTy::synchronize() {
45 int Result = OFFLOAD_SUCCESS;
46 if (!isQueueEmpty()) {
47 switch (SyncType) {
48 case SyncTy::BLOCKING:
49 // If we have a queue we need to synchronize it now.
      Result = Device.synchronize(*this);
51 assert(AsyncInfo.Queue == nullptr &&
52 "The device plugin should have nulled the queue to indicate there "
53 "are no outstanding actions!");
54 break;
55 case SyncTy::NON_BLOCKING:
      Result = Device.queryAsync(*this);
57 break;
58 }
59 }
60
61 // Run any pending post-processing function registered on this async object.
62 if (Result == OFFLOAD_SUCCESS && isQueueEmpty())
63 Result = runPostProcessing();
64
65 return Result;
66}
67
68void *&AsyncInfoTy::getVoidPtrLocation() {
  BufferLocations.push_back(nullptr);
70 return BufferLocations.back();
71}
72
73bool AsyncInfoTy::isDone() const { return isQueueEmpty(); }
74
75int32_t AsyncInfoTy::runPostProcessing() {
76 size_t Size = PostProcessingFunctions.size();
77 for (size_t I = 0; I < Size; ++I) {
78 const int Result = PostProcessingFunctions[I]();
79 if (Result != OFFLOAD_SUCCESS)
80 return Result;
81 }
82
83 // Clear the vector up until the last known function, since post-processing
84 // procedures might add new procedures themselves.
85 const auto *PrevBegin = PostProcessingFunctions.begin();
  PostProcessingFunctions.erase(PrevBegin, PrevBegin + Size);
87
88 return OFFLOAD_SUCCESS;
89}
90
91bool AsyncInfoTy::isQueueEmpty() const { return AsyncInfo.Queue == nullptr; }
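
// For illustration only: a minimal sketch of how a caller is expected to drive
// an AsyncInfoTy object (TgtPtr, HstPtr and Size are placeholders, not names
// used in this file):
//
//   AsyncInfoTy AsyncInfo(Device);
//   Device.submitData(TgtPtr, HstPtr, Size, AsyncInfo); // enqueues async work
//   AsyncInfo.addPostProcessingFunction([]() { return OFFLOAD_SUCCESS; });
//   // synchronize() drains the queue and then runs the post-processing
//   // functions registered above.
//   if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS)
//     return OFFLOAD_FAIL;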
92
93/* All begin addresses for partially mapped structs must be aligned, up to 16,
94 * in order to ensure proper alignment of members. E.g.
95 *
96 * struct S {
97 * int a; // 4-aligned
98 * int b; // 4-aligned
99 * int *p; // 8-aligned
100 * } s1;
101 * ...
102 * #pragma omp target map(tofrom: s1.b, s1.p[0:N])
103 * {
104 * s1.b = 5;
105 * for (int i...) s1.p[i] = ...;
106 * }
107 *
108 * Here we are mapping s1 starting from member b, so BaseAddress=&s1=&s1.a and
109 * BeginAddress=&s1.b. Let's assume that the struct begins at address 0x100,
110 * then &s1.a=0x100, &s1.b=0x104, &s1.p=0x108. Each member obeys the alignment
111 * requirements for its type. Now, when we allocate memory on the device, in
112 * CUDA's case cuMemAlloc() returns an address which is at least 256-aligned.
113 * This means that the chunk of the struct on the device will start at a
114 * 256-aligned address, let's say 0x200. Then the address of b will be 0x200 and
115 * address of p will be a misaligned 0x204 (on the host there was no need to add
116 * padding between b and p, so p comes exactly 4 bytes after b). If the device
117 * kernel tries to access s1.p, a misaligned address error occurs (as reported
118 * by the CUDA plugin). By padding the begin address down to a multiple of 8 and
 * extending the size of the allocated chunk accordingly, the chunk on the
120 * device will start at 0x200 with the padding (4 bytes), then &s1.b=0x204 and
121 * &s1.p=0x208, as they should be to satisfy the alignment requirements.
122 */
123static const int64_t MaxAlignment = 16;
124
125/// Return the alignment requirement of partially mapped structs, see
126/// MaxAlignment above.
127static uint64_t getPartialStructRequiredAlignment(void *HstPtrBase) {
128 int LowestOneBit = __builtin_ffsl(reinterpret_cast<uintptr_t>(HstPtrBase));
129 uint64_t BaseAlignment = 1 << (LowestOneBit - 1);
130 return MaxAlignment < BaseAlignment ? MaxAlignment : BaseAlignment;
131}
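
// For example (illustrative addresses): if HstPtrBase is 0x1004, the lowest
// set bit is bit 2, so BaseAlignment is 4 and getPartialStructRequiredAlignment
// returns 4. If HstPtrBase is 0x1000, BaseAlignment is 4096 and the result is
// clamped to MaxAlignment, so the function returns 16.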
132
133/// Map global data and execute pending ctors
134static int initLibrary(DeviceTy &Device) {
135 /*
136 * Map global data
137 */
138 int32_t DeviceId = Device.DeviceID;
139 int Rc = OFFLOAD_SUCCESS;
140 {
141 std::lock_guard<decltype(PM->TrlTblMtx)> LG(PM->TrlTblMtx);
142 for (auto *HostEntriesBegin : PM->HostEntriesBeginRegistrationOrder) {
143 TranslationTable *TransTable =
144 &PM->HostEntriesBeginToTransTable[HostEntriesBegin];
145 if (TransTable->HostTable.EntriesBegin ==
146 TransTable->HostTable.EntriesEnd) {
147 // No host entry so no need to proceed
148 continue;
149 }
150
151 if (TransTable->TargetsTable[DeviceId] != 0) {
152 // Library entries have already been processed
153 continue;
154 }
155
156 // 1) get image.
157 assert(TransTable->TargetsImages.size() > (size_t)DeviceId &&
158 "Not expecting a device ID outside the table's bounds!");
159 __tgt_device_image *Img = TransTable->TargetsImages[DeviceId];
160 if (!Img) {
161 REPORT("No image loaded for device id %d.\n", DeviceId);
162 Rc = OFFLOAD_FAIL;
163 break;
164 }
165
166 // 2) Load the image onto the given device.
167 auto BinaryOrErr = Device.loadBinary(Img);
168 if (llvm::Error Err = BinaryOrErr.takeError()) {
169 REPORT("Failed to load image %s\n",
170 llvm::toString(std::move(Err)).c_str());
171 Rc = OFFLOAD_FAIL;
172 break;
173 }
174
175 // 3) Create the translation table.
176 llvm::SmallVector<__tgt_offload_entry> &DeviceEntries =
177 TransTable->TargetsEntries[DeviceId];
178 for (__tgt_offload_entry &Entry :
           llvm::make_range(Img->EntriesBegin, Img->EntriesEnd)) {
180 __tgt_device_binary &Binary = *BinaryOrErr;
181
182 __tgt_offload_entry DeviceEntry = Entry;
183 if (Entry.size) {
184 if (Device.RTL->get_global(Binary, Entry.size, Entry.name,
185 &DeviceEntry.addr) != OFFLOAD_SUCCESS)
186 REPORT("Failed to load symbol %s\n", Entry.name);
187
188 // If unified memory is active, the corresponding global is a device
189 // reference to the host global. We need to initialize the pointer on
          // the device to point to the memory on the host.
191 if ((PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY) ||
192 (PM->getRequirements() & OMPX_REQ_AUTO_ZERO_COPY)) {
193 if (Device.RTL->data_submit(DeviceId, DeviceEntry.addr, Entry.addr,
194 Entry.size) != OFFLOAD_SUCCESS)
195 REPORT("Failed to write symbol for USM %s\n", Entry.name);
196 }
197 } else {
198 if (Device.RTL->get_function(Binary, Entry.name, &DeviceEntry.addr) !=
199 OFFLOAD_SUCCESS)
200 REPORT("Failed to load kernel %s\n", Entry.name);
201 }
202 DP("Entry point " DPxMOD " maps to%s %s (" DPxMOD ")\n",
203 DPxPTR(Entry.addr), (Entry.size) ? " global" : "", Entry.name,
204 DPxPTR(DeviceEntry.addr));
205
        DeviceEntries.emplace_back(DeviceEntry);
207 }
208
209 // Set the storage for the table and get a pointer to it.
      __tgt_target_table DeviceTable{&DeviceEntries[0],
                                     &DeviceEntries[0] + DeviceEntries.size()};
212 TransTable->DeviceTables[DeviceId] = DeviceTable;
213 __tgt_target_table *TargetTable = TransTable->TargetsTable[DeviceId] =
214 &TransTable->DeviceTables[DeviceId];
215
216 // 4) Verify whether the two table sizes match.
217 size_t Hsize =
218 TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin;
219 size_t Tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin;
220
221 // Invalid image for these host entries!
222 if (Hsize != Tsize) {
223 REPORT(
224 "Host and Target tables mismatch for device id %d [%zx != %zx].\n",
225 DeviceId, Hsize, Tsize);
226 TransTable->TargetsImages[DeviceId] = 0;
227 TransTable->TargetsTable[DeviceId] = 0;
228 Rc = OFFLOAD_FAIL;
229 break;
230 }
231
232 MappingInfoTy::HDTTMapAccessorTy HDTTMap =
233 Device.getMappingInfo().HostDataToTargetMap.getExclusiveAccessor();
234
235 __tgt_target_table *HostTable = &TransTable->HostTable;
236 for (__tgt_offload_entry *CurrDeviceEntry = TargetTable->EntriesBegin,
237 *CurrHostEntry = HostTable->EntriesBegin,
238 *EntryDeviceEnd = TargetTable->EntriesEnd;
239 CurrDeviceEntry != EntryDeviceEnd;
240 CurrDeviceEntry++, CurrHostEntry++) {
241 if (CurrDeviceEntry->size == 0)
242 continue;
243
244 assert(CurrDeviceEntry->size == CurrHostEntry->size &&
245 "data size mismatch");
246
247 // Fortran may use multiple weak declarations for the same symbol,
248 // therefore we must allow for multiple weak symbols to be loaded from
249 // the fat binary. Treat these mappings as any other "regular"
250 // mapping. Add entry to map.
        if (Device.getMappingInfo().getTgtPtrBegin(
                HDTTMap, CurrHostEntry->addr, CurrHostEntry->size))
253 continue;
254
255 void *CurrDeviceEntryAddr = CurrDeviceEntry->addr;
256
257 // For indirect mapping, follow the indirection and map the actual
258 // target.
259 if (CurrDeviceEntry->flags & OMP_DECLARE_TARGET_INDIRECT) {
260 AsyncInfoTy AsyncInfo(Device);
261 void *DevPtr;
          Device.retrieveData(&DevPtr, CurrDeviceEntryAddr, sizeof(void *),
                              AsyncInfo, /*Entry=*/nullptr, &HDTTMap);
264 if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS)
265 return OFFLOAD_FAIL;
266 CurrDeviceEntryAddr = DevPtr;
267 }
268
269 DP("Add mapping from host " DPxMOD " to device " DPxMOD " with size %zu"
270 ", name \"%s\"\n",
271 DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr),
272 CurrDeviceEntry->size, CurrDeviceEntry->name);
        HDTTMap->emplace(new HostDataToTargetTy(
274 (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/,
275 (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/,
276 (uintptr_t)CurrHostEntry->addr + CurrHostEntry->size /*HstPtrEnd*/,
277 (uintptr_t)CurrDeviceEntryAddr /*TgtAllocBegin*/,
278 (uintptr_t)CurrDeviceEntryAddr /*TgtPtrBegin*/,
279 false /*UseHoldRefCount*/, CurrHostEntry->name,
280 true /*IsRefCountINF*/));
281
282 // Notify about the new mapping.
        if (Device.notifyDataMapped(CurrHostEntry->addr, CurrHostEntry->size))
284 return OFFLOAD_FAIL;
285 }
286 }
287 }
288
289 if (Rc != OFFLOAD_SUCCESS)
290 return Rc;
291
292 static Int32Envar DumpOffloadEntries =
293 Int32Envar("OMPTARGET_DUMP_OFFLOAD_ENTRIES", -1);
294 if (DumpOffloadEntries.get() == DeviceId)
295 Device.dumpOffloadEntries();
296
297 return OFFLOAD_SUCCESS;
298}
299
300void handleTargetOutcome(bool Success, ident_t *Loc) {
  switch (OffloadPolicy::get(*PM).Kind) {
302 case OffloadPolicy::DISABLED:
303 if (Success) {
304 FATAL_MESSAGE0(1, "expected no offloading while offloading is disabled");
305 }
306 break;
307 case OffloadPolicy::MANDATORY:
308 if (!Success) {
309 if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE) {
310 auto ExclusiveDevicesAccessor = PM->getExclusiveDevicesAccessor();
        for (auto &Device : PM->devices(ExclusiveDevicesAccessor))
312 dumpTargetPointerMappings(Loc, Device);
313 } else
314 FAILURE_MESSAGE("Consult https://openmp.llvm.org/design/Runtimes.html "
315 "for debugging options.\n");
316
317 if (!PM->getNumUsedPlugins()) {
318 FAILURE_MESSAGE(
319 "No images found compatible with the installed hardware. ");
320
321 llvm::SmallVector<llvm::StringRef> Archs;
322 for (auto &Image : PM->deviceImages()) {
323 const char *Start = reinterpret_cast<const char *>(
324 Image.getExecutableImage().ImageStart);
          uint64_t Length = llvm::omp::target::getPtrDiff(
              Image.getExecutableImage().ImageEnd, Start);
327 llvm::MemoryBufferRef Buffer(llvm::StringRef(Start, Length),
328 /*Identifier=*/"");
329
          auto ObjectOrErr =
              llvm::object::ObjectFile::createObjectFile(Buffer);
331 if (auto Err = ObjectOrErr.takeError()) {
            llvm::consumeError(std::move(Err));
333 continue;
334 }
335
336 if (auto CPU = (*ObjectOrErr)->tryGetCPUName())
            Archs.push_back(*CPU);
338 }
        fprintf(stderr, "Found %zu image(s): (%s)\n", Archs.size(),
                llvm::join(Archs, ",").c_str());
341 }
342
343 SourceInfo Info(Loc);
344 if (Info.isAvailible())
        fprintf(stderr, "%s:%d:%d: ", Info.getFilename(), Info.getLine(),
346 Info.getColumn());
347 else
348 FAILURE_MESSAGE("Source location information not present. Compile with "
349 "-g or -gline-tables-only.\n");
350 FATAL_MESSAGE0(
351 1, "failure of target construct while offloading is mandatory");
352 } else {
353 if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE) {
354 auto ExclusiveDevicesAccessor = PM->getExclusiveDevicesAccessor();
        for (auto &Device : PM->devices(ExclusiveDevicesAccessor))
356 dumpTargetPointerMappings(Loc, Device);
357 }
358 }
359 break;
360 }
361}
362
363// If offload is enabled, ensure that device DeviceID has been initialized,
364// global ctors have been executed, and global data has been mapped.
365//
// The returned bool indicates whether the offload is to the host device.
// There are three possible results:
// - Return false if the target device is ready for offload.
369// - Return true without reporting a runtime error if offload is
370// disabled, perhaps because the initial device was specified.
371// - Report a runtime error and return true.
372//
373// If DeviceID == OFFLOAD_DEVICE_DEFAULT, set DeviceID to the default device.
374// This step might be skipped if offload is disabled.
375bool checkDeviceAndCtors(int64_t &DeviceID, ident_t *Loc) {
  if (OffloadPolicy::get(*PM).Kind == OffloadPolicy::DISABLED) {
377 DP("Offload is disabled\n");
378 return true;
379 }
380
381 if (DeviceID == OFFLOAD_DEVICE_DEFAULT) {
382 DeviceID = omp_get_default_device();
383 DP("Use default device id %" PRId64 "\n", DeviceID);
384 }
385
386 // Proposed behavior for OpenMP 5.2 in OpenMP spec github issue 2669.
387 if (omp_get_num_devices() == 0) {
    DP("omp_get_num_devices() == 0 but offload is mandatory\n");
    handleTargetOutcome(false, Loc);
390 return true;
391 }
392
393 if (DeviceID == omp_get_initial_device()) {
394 DP("Device is host (%" PRId64 "), returning as if offload is disabled\n",
395 DeviceID);
396 return true;
397 }
398
  auto DeviceOrErr = PM->getDevice(DeviceID);
400 if (!DeviceOrErr)
401 FATAL_MESSAGE(DeviceID, "%s", toString(DeviceOrErr.takeError()).data());
402
403 // Check whether global data has been mapped for this device
  if (initLibrary(*DeviceOrErr) != OFFLOAD_SUCCESS) {
    REPORT("Failed to init globals on device %" PRId64 "\n", DeviceID);
    handleTargetOutcome(false, Loc);
407 return true;
408 }
409
410 return false;
411}
412
413static int32_t getParentIndex(int64_t Type) {
414 return ((Type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1;
415}
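
// For example: a map type whose MEMBER_OF field (the upper 16 bits masked by
// OMP_TGT_MAPTYPE_MEMBER_OF) holds the value 3 marks the argument as a member
// of the struct passed at argument index 2; a value of 0 (no MEMBER_OF bits
// set) makes getParentIndex return -1, i.e. "no parent".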
416
417void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
418 const char *Name) {
419 DP("Call to %s for device %d requesting %zu bytes\n", Name, DeviceNum, Size);
420
421 if (Size <= 0) {
422 DP("Call to %s with non-positive length\n", Name);
423 return NULL;
424 }
425
426 void *Rc = NULL;
427
428 if (DeviceNum == omp_get_initial_device()) {
    Rc = malloc(Size);
430 DP("%s returns host ptr " DPxMOD "\n", Name, DPxPTR(Rc));
431 return Rc;
432 }
433
  auto DeviceOrErr = PM->getDevice(DeviceNum);
435 if (!DeviceOrErr)
436 FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());
437
  Rc = DeviceOrErr->allocData(Size, nullptr, Kind);
439 DP("%s returns device ptr " DPxMOD "\n", Name, DPxPTR(Rc));
440 return Rc;
441}
442
443void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
444 const char *Name) {
445 DP("Call to %s for device %d and address " DPxMOD "\n", Name, DeviceNum,
446 DPxPTR(DevicePtr));
447
448 if (!DevicePtr) {
449 DP("Call to %s with NULL ptr\n", Name);
450 return;
451 }
452
453 if (DeviceNum == omp_get_initial_device()) {
    free(DevicePtr);
455 DP("%s deallocated host ptr\n", Name);
456 return;
457 }
458
  auto DeviceOrErr = PM->getDevice(DeviceNum);
460 if (!DeviceOrErr)
461 FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());
462
  DeviceOrErr->deleteData(DevicePtr, Kind);
  DP("%s deallocated device ptr\n", Name);
465}
466
467void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum,
468 const char *Name) {
469 DP("Call to %s for device %d locking %zu bytes\n", Name, DeviceNum, Size);
470
471 if (Size <= 0) {
472 DP("Call to %s with non-positive length\n", Name);
473 return NULL;
474 }
475
476 void *RC = NULL;
477
  auto DeviceOrErr = PM->getDevice(DeviceNum);
479 if (!DeviceOrErr)
480 FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());
481
482 int32_t Err = 0;
  if (DeviceOrErr->RTL->data_lock) {
    Err = DeviceOrErr->RTL->data_lock(DeviceNum, HostPtr, Size, &RC);
    if (Err) {
      DP("Could not lock ptr %p\n", HostPtr);
      return nullptr;
    }
  }
490 DP("%s returns device ptr " DPxMOD "\n", Name, DPxPTR(RC));
491 return RC;
492}
493
494void targetUnlockExplicit(void *HostPtr, int DeviceNum, const char *Name) {
495 DP("Call to %s for device %d unlocking\n", Name, DeviceNum);
496
  auto DeviceOrErr = PM->getDevice(DeviceNum);
498 if (!DeviceOrErr)
499 FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());
500
  if (DeviceOrErr->RTL->data_unlock)
    DeviceOrErr->RTL->data_unlock(DeviceNum, HostPtr);
503
504 DP("%s returns\n", Name);
505}
506
/// Call the user-defined mapper function followed by the appropriate
/// targetData* function (targetData{Begin,End,Update}).
509int targetDataMapper(ident_t *Loc, DeviceTy &Device, void *ArgBase, void *Arg,
510 int64_t ArgSize, int64_t ArgType, map_var_info_t ArgNames,
511 void *ArgMapper, AsyncInfoTy &AsyncInfo,
512 TargetDataFuncPtrTy TargetDataFunction) {
513 DP("Calling the mapper function " DPxMOD "\n", DPxPTR(ArgMapper));
514
515 // The mapper function fills up Components.
516 MapperComponentsTy MapperComponents;
517 MapperFuncPtrTy MapperFuncPtr = (MapperFuncPtrTy)(ArgMapper);
518 (*MapperFuncPtr)((void *)&MapperComponents, ArgBase, Arg, ArgSize, ArgType,
519 ArgNames);
520
521 // Construct new arrays for args_base, args, arg_sizes and arg_types
522 // using the information in MapperComponents and call the corresponding
523 // targetData* function using these new arrays.
524 SmallVector<void *> MapperArgsBase(MapperComponents.Components.size());
525 SmallVector<void *> MapperArgs(MapperComponents.Components.size());
526 SmallVector<int64_t> MapperArgSizes(MapperComponents.Components.size());
527 SmallVector<int64_t> MapperArgTypes(MapperComponents.Components.size());
528 SmallVector<void *> MapperArgNames(MapperComponents.Components.size());
529
530 for (unsigned I = 0, E = MapperComponents.Components.size(); I < E; ++I) {
531 auto &C = MapperComponents.Components[I];
532 MapperArgsBase[I] = C.Base;
533 MapperArgs[I] = C.Begin;
534 MapperArgSizes[I] = C.Size;
535 MapperArgTypes[I] = C.Type;
536 MapperArgNames[I] = C.Name;
537 }
538
539 int Rc = TargetDataFunction(Loc, Device, MapperComponents.Components.size(),
540 MapperArgsBase.data(), MapperArgs.data(),
541 MapperArgSizes.data(), MapperArgTypes.data(),
542 MapperArgNames.data(), /*arg_mappers*/ nullptr,
543 AsyncInfo, /*FromMapper=*/true);
544
545 return Rc;
546}
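
// For illustration (user-side OpenMP code, not part of the runtime): a custom
// mapper such as
//
//   struct Vec { int Len; double *Data; };
//   #pragma omp declare mapper(Vec v) map(v, v.Data[0 : v.Len])
//   ...
//   #pragma omp target map(tofrom : V)   // V has type Vec
//   { ... }
//
// causes the compiler to pass a mapper function for V; targetDataMapper calls
// it to expand V into its component entries and then re-invokes the requested
// targetData{Begin,End,Update} function on the expanded argument arrays.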
547
548/// Internal function to do the mapping and transfer the data to the device
549int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
550 void **ArgsBase, void **Args, int64_t *ArgSizes,
551 int64_t *ArgTypes, map_var_info_t *ArgNames,
552 void **ArgMappers, AsyncInfoTy &AsyncInfo,
553 bool FromMapper) {
554 // process each input.
555 for (int32_t I = 0; I < ArgNum; ++I) {
556 // Ignore private variables and arrays - there is no mapping for them.
557 if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
558 (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
559 continue;
560 TIMESCOPE_WITH_DETAILS_AND_IDENT(
561 "HostToDev", "Size=" + std::to_string(ArgSizes[I]) + "B", Loc);
562 if (ArgMappers && ArgMappers[I]) {
563 // Instead of executing the regular path of targetDataBegin, call the
564 // targetDataMapper variant which will call targetDataBegin again
565 // with new arguments.
566 DP("Calling targetDataMapper for the %dth argument\n", I);
567
568 map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
      int Rc = targetDataMapper(Loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
                                ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
                                targetDataBegin);
572
573 if (Rc != OFFLOAD_SUCCESS) {
574 REPORT("Call to targetDataBegin via targetDataMapper for custom mapper"
575 " failed.\n");
576 return OFFLOAD_FAIL;
577 }
578
579 // Skip the rest of this function, continue to the next argument.
580 continue;
581 }
582
583 void *HstPtrBegin = Args[I];
584 void *HstPtrBase = ArgsBase[I];
585 int64_t DataSize = ArgSizes[I];
586 map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I];
587
588 // Adjust for proper alignment if this is a combined entry (for structs).
589 // Look at the next argument - if that is MEMBER_OF this one, then this one
590 // is a combined entry.
591 int64_t TgtPadding = 0;
592 const int NextI = I + 1;
    if (getParentIndex(ArgTypes[I]) < 0 && NextI < ArgNum &&
        getParentIndex(ArgTypes[NextI]) == I) {
595 int64_t Alignment = getPartialStructRequiredAlignment(HstPtrBase);
596 TgtPadding = (int64_t)HstPtrBegin % Alignment;
597 if (TgtPadding) {
598 DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD
599 "\n",
600 TgtPadding, DPxPTR(HstPtrBegin));
601 }
602 }
603
604 // Address of pointer on the host and device, respectively.
605 void *PointerHstPtrBegin, *PointerTgtPtrBegin;
606 TargetPointerResultTy PointerTpr;
607 bool IsHostPtr = false;
608 bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT;
609 // Force the creation of a device side copy of the data when:
610 // a close map modifier was associated with a map that contained a to.
611 bool HasCloseModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_CLOSE;
612 bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT;
613 bool HasHoldModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_OMPX_HOLD;
614 // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we
615 // have reached this point via __tgt_target_data_begin and not __tgt_target
616 // then no argument is marked as TARGET_PARAM ("omp target data map" is not
617 // associated with a target region, so there are no target parameters). This
618 // may be considered a hack, we could revise the scheme in the future.
619 bool UpdateRef =
620 !(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) && !(FromMapper && I == 0);
621
622 MappingInfoTy::HDTTMapAccessorTy HDTTMap =
623 Device.getMappingInfo().HostDataToTargetMap.getExclusiveAccessor();
624 if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
625 DP("Has a pointer entry: \n");
626 // Base is address of pointer.
627 //
628 // Usually, the pointer is already allocated by this time. For example:
629 //
630 // #pragma omp target map(s.p[0:N])
631 //
632 // The map entry for s comes first, and the PTR_AND_OBJ entry comes
633 // afterward, so the pointer is already allocated by the time the
634 // PTR_AND_OBJ entry is handled below, and PointerTgtPtrBegin is thus
635 // non-null. However, "declare target link" can produce a PTR_AND_OBJ
636 // entry for a global that might not already be allocated by the time the
637 // PTR_AND_OBJ entry is handled below, and so the allocation might fail
638 // when HasPresentModifier.
639 PointerTpr = Device.getMappingInfo().getTargetPointer(
          HDTTMap, HstPtrBase, HstPtrBase, /*TgtPadding=*/0, sizeof(void *),
          /*HstPtrName=*/nullptr,
          /*HasFlagTo=*/false, /*HasFlagAlways=*/false, IsImplicit, UpdateRef,
643 HasCloseModifier, HasPresentModifier, HasHoldModifier, AsyncInfo,
644 /*OwnedTPR=*/nullptr, /*ReleaseHDTTMap=*/false);
645 PointerTgtPtrBegin = PointerTpr.TargetPointer;
646 IsHostPtr = PointerTpr.Flags.IsHostPointer;
647 if (!PointerTgtPtrBegin) {
648 REPORT("Call to getTargetPointer returned null pointer (%s).\n",
649 HasPresentModifier ? "'present' map type modifier"
650 : "device failure or illegal mapping");
651 return OFFLOAD_FAIL;
652 }
653 DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new"
654 "\n",
655 sizeof(void *), DPxPTR(PointerTgtPtrBegin),
656 (PointerTpr.Flags.IsNewEntry ? "" : " not"));
657 PointerHstPtrBegin = HstPtrBase;
658 // modify current entry.
659 HstPtrBase = *(void **)HstPtrBase;
660 // No need to update pointee ref count for the first element of the
661 // subelement that comes from mapper.
662 UpdateRef =
663 (!FromMapper || I != 0); // subsequently update ref count of pointee
664 }
665
666 const bool HasFlagTo = ArgTypes[I] & OMP_TGT_MAPTYPE_TO;
667 const bool HasFlagAlways = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS;
668 // Note that HDTTMap will be released in getTargetPointer.
669 auto TPR = Device.getMappingInfo().getTargetPointer(
        HDTTMap, HstPtrBegin, HstPtrBase, TgtPadding, DataSize, HstPtrName,
        HasFlagTo, HasFlagAlways, IsImplicit, UpdateRef, HasCloseModifier,
        HasPresentModifier, HasHoldModifier, AsyncInfo, PointerTpr.getEntry());
673 void *TgtPtrBegin = TPR.TargetPointer;
674 IsHostPtr = TPR.Flags.IsHostPointer;
675 // If data_size==0, then the argument could be a zero-length pointer to
676 // NULL, so getOrAlloc() returning NULL is not an error.
677 if (!TgtPtrBegin && (DataSize || HasPresentModifier)) {
678 REPORT("Call to getTargetPointer returned null pointer (%s).\n",
679 HasPresentModifier ? "'present' map type modifier"
680 : "device failure or illegal mapping");
681 return OFFLOAD_FAIL;
682 }
683 DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
684 " - is%s new\n",
685 DataSize, DPxPTR(TgtPtrBegin), (TPR.Flags.IsNewEntry ? "" : " not"));
686
687 if (ArgTypes[I] & OMP_TGT_MAPTYPE_RETURN_PARAM) {
688 uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase;
689 void *TgtPtrBase = (void *)((uintptr_t)TgtPtrBegin - Delta);
690 DP("Returning device pointer " DPxMOD "\n", DPxPTR(TgtPtrBase));
691 ArgsBase[I] = TgtPtrBase;
692 }
693
694 if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) {
695
696 uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
697 void *ExpectedTgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta);
698
      if (PointerTpr.getEntry()->addShadowPointer(ShadowPtrInfoTy{
              (void **)PointerHstPtrBegin, HstPtrBase,
              (void **)PointerTgtPtrBegin, ExpectedTgtPtrBase})) {
702 DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n",
703 DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
704
705 void *&TgtPtrBase = AsyncInfo.getVoidPtrLocation();
706 TgtPtrBase = ExpectedTgtPtrBase;
707
708 int Ret =
            Device.submitData(PointerTgtPtrBegin, &TgtPtrBase, sizeof(void *),
                              AsyncInfo, PointerTpr.getEntry());
711 if (Ret != OFFLOAD_SUCCESS) {
712 REPORT("Copying data to device failed.\n");
713 return OFFLOAD_FAIL;
714 }
715 if (PointerTpr.getEntry()->addEventIfNecessary(Device, AsyncInfo) !=
716 OFFLOAD_SUCCESS)
717 return OFFLOAD_FAIL;
718 }
719 }
720
721 // Check if variable can be used on the device:
722 bool IsStructMember = ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF;
723 if (getInfoLevel() & OMP_INFOTYPE_EMPTY_MAPPING && ArgTypes[I] != 0 &&
724 !IsStructMember && !IsImplicit && !TPR.isPresent() &&
725 !TPR.isContained() && !TPR.isHostPointer())
726 INFO(OMP_INFOTYPE_EMPTY_MAPPING, Device.DeviceID,
727 "variable %s does not have a valid device counterpart\n",
728 (HstPtrName) ? getNameFromMapping(HstPtrName).c_str() : "unknown");
729 }
730
731 return OFFLOAD_SUCCESS;
732}
733
734namespace {
/// This structure contains the information needed to deallocate a target
/// pointer, i.e., to fix up the shadow map and potentially delete the entry
/// from the mapping table via \p DeviceTy::deallocTgtPtr.
738struct PostProcessingInfo {
739 /// Host pointer used to look up into the map table
740 void *HstPtrBegin;
741
742 /// Size of the data
743 int64_t DataSize;
744
745 /// The mapping type (bitfield).
746 int64_t ArgType;
747
748 /// The target pointer information.
749 TargetPointerResultTy TPR;
750
751 PostProcessingInfo(void *HstPtr, int64_t Size, int64_t ArgType,
752 TargetPointerResultTy &&TPR)
753 : HstPtrBegin(HstPtr), DataSize(Size), ArgType(ArgType),
754 TPR(std::move(TPR)) {}
755};
756
757} // namespace
758
759/// Applies the necessary post-processing procedures to entries listed in \p
760/// EntriesInfo after the execution of all device side operations from a target
/// data end. This includes updating pointers on the host and removing the
/// device buffer when needed. It returns OFFLOAD_FAIL or OFFLOAD_SUCCESS
/// depending on whether the operations succeeded.
764[[nodiscard]] static int
765postProcessingTargetDataEnd(DeviceTy *Device,
766 SmallVector<PostProcessingInfo> &EntriesInfo) {
767 int Ret = OFFLOAD_SUCCESS;
768
769 for (auto &[HstPtrBegin, DataSize, ArgType, TPR] : EntriesInfo) {
770 bool DelEntry = !TPR.isHostPointer();
771
    // If this is the last element from the mapper (for a data end, the
    // arguments come in reverse order), do not remove the partial entry;
    // the parent struct still exists.
775 if ((ArgType & OMP_TGT_MAPTYPE_MEMBER_OF) &&
776 !(ArgType & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
777 DelEntry = false; // protect parent struct from being deallocated
778 }
779
780 // If we marked the entry to be deleted we need to verify no other
781 // thread reused it by now. If deletion is still supposed to happen by
782 // this thread LR will be set and exclusive access to the HDTT map
783 // will avoid another thread reusing the entry now. Note that we do
784 // not request (exclusive) access to the HDTT map if DelEntry is
785 // not set.
786 MappingInfoTy::HDTTMapAccessorTy HDTTMap =
787 Device->getMappingInfo().HostDataToTargetMap.getExclusiveAccessor();
788
    // We cannot use a lock guard because we may end up deleting the mutex.
    // We also explicitly unlocked the entry after it was put in the
    // EntriesInfo so it can be reused.
792 TPR.getEntry()->lock();
793 auto *Entry = TPR.getEntry();
794
795 const bool IsNotLastUser = Entry->decDataEndThreadCount() != 0;
796 if (DelEntry && (Entry->getTotalRefCount() != 0 || IsNotLastUser)) {
797 // The thread is not in charge of deletion anymore. Give up access
798 // to the HDTT map and unset the deletion flag.
799 HDTTMap.destroy();
800 DelEntry = false;
801 }
802
803 // If we copied back to the host a struct/array containing pointers,
804 // we need to restore the original host pointer values from their
805 // shadow copies. If the struct is going to be deallocated, remove any
806 // remaining shadow pointer entries for this struct.
807 const bool HasFrom = ArgType & OMP_TGT_MAPTYPE_FROM;
808 if (HasFrom) {
      Entry->foreachShadowPointerInfo([&](const ShadowPtrInfoTy &ShadowPtr) {
810 *ShadowPtr.HstPtrAddr = ShadowPtr.HstPtrVal;
811 DP("Restoring original host pointer value " DPxMOD " for host "
812 "pointer " DPxMOD "\n",
813 DPxPTR(ShadowPtr.HstPtrVal), DPxPTR(ShadowPtr.HstPtrAddr));
814 return OFFLOAD_SUCCESS;
815 });
816 }
817
818 // Give up the lock as we either don't need it anymore (e.g., done with
819 // TPR), or erase TPR.
    TPR.setEntry(nullptr);
821
822 if (!DelEntry)
823 continue;
824
    Ret = Device->getMappingInfo().eraseMapEntry(HDTTMap, Entry, DataSize);
    // Entry is already removed from the map, we can unlock it now.
827 HDTTMap.destroy();
    Ret |= Device->getMappingInfo().deallocTgtPtrAndEntry(Entry, DataSize);
829 if (Ret != OFFLOAD_SUCCESS) {
830 REPORT("Deallocating data from device failed.\n");
831 break;
832 }
833 }
834
835 delete &EntriesInfo;
836 return Ret;
837}
838
839/// Internal function to undo the mapping and retrieve the data from the device.
840int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
841 void **ArgBases, void **Args, int64_t *ArgSizes,
842 int64_t *ArgTypes, map_var_info_t *ArgNames,
843 void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) {
844 int Ret = OFFLOAD_SUCCESS;
845 auto *PostProcessingPtrs = new SmallVector<PostProcessingInfo>();
846 // process each input.
847 for (int32_t I = ArgNum - 1; I >= 0; --I) {
848 // Ignore private variables and arrays - there is no mapping for them.
849 // Also, ignore the use_device_ptr directive, it has no effect here.
850 if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
851 (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
852 continue;
853
854 if (ArgMappers && ArgMappers[I]) {
855 // Instead of executing the regular path of targetDataEnd, call the
856 // targetDataMapper variant which will call targetDataEnd again
857 // with new arguments.
858 DP("Calling targetDataMapper for the %dth argument\n", I);
859
860 map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
      Ret = targetDataMapper(Loc, Device, ArgBases[I], Args[I], ArgSizes[I],
                             ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
                             targetDataEnd);
864
865 if (Ret != OFFLOAD_SUCCESS) {
866 REPORT("Call to targetDataEnd via targetDataMapper for custom mapper"
867 " failed.\n");
868 return OFFLOAD_FAIL;
869 }
870
871 // Skip the rest of this function, continue to the next argument.
872 continue;
873 }
874
875 void *HstPtrBegin = Args[I];
876 int64_t DataSize = ArgSizes[I];
877 bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT;
878 bool UpdateRef = (!(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) ||
879 (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) &&
880 !(FromMapper && I == 0);
881 bool ForceDelete = ArgTypes[I] & OMP_TGT_MAPTYPE_DELETE;
882 bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT;
883 bool HasHoldModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_OMPX_HOLD;
884
885 // If PTR_AND_OBJ, HstPtrBegin is address of pointee
886 TargetPointerResultTy TPR = Device.getMappingInfo().getTgtPtrBegin(
        HstPtrBegin, DataSize, UpdateRef, HasHoldModifier, !IsImplicit,
888 ForceDelete, /*FromDataEnd=*/true);
889 void *TgtPtrBegin = TPR.TargetPointer;
890 if (!TPR.isPresent() && !TPR.isHostPointer() &&
891 (DataSize || HasPresentModifier)) {
892 DP("Mapping does not exist (%s)\n",
893 (HasPresentModifier ? "'present' map type modifier" : "ignored"));
894 if (HasPresentModifier) {
895 // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 350 L10-13:
896 // "If a map clause appears on a target, target data, target enter data
897 // or target exit data construct with a present map-type-modifier then
898 // on entry to the region if the corresponding list item does not appear
899 // in the device data environment then an error occurs and the program
900 // terminates."
901 //
902 // This should be an error upon entering an "omp target exit data". It
903 // should not be an error upon exiting an "omp target data" or "omp
904 // target". For "omp target data", Clang thus doesn't include present
905 // modifiers for end calls. For "omp target", we have not found a valid
906 // OpenMP program for which the error matters: it appears that, if a
907 // program can guarantee that data is present at the beginning of an
908 // "omp target" region so that there's no error there, that data is also
909 // guaranteed to be present at the end.
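        //
        // For example (user-side OpenMP code, for illustration only):
        //
        //   #pragma omp target exit data map(present, release : x)
        //
        // triggers the 'device mapping required ...' error below if x is not
        // already present in the device data environment.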
910 MESSAGE("device mapping required by 'present' map type modifier does "
911 "not exist for host address " DPxMOD " (%" PRId64 " bytes)",
912 DPxPTR(HstPtrBegin), DataSize);
913 return OFFLOAD_FAIL;
914 }
915 } else {
916 DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
917 " - is%s last\n",
918 DataSize, DPxPTR(TgtPtrBegin), (TPR.Flags.IsLast ? "" : " not"));
919 }
920
921 // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 351 L14-16:
922 // "If the map clause appears on a target, target data, or target exit data
923 // construct and a corresponding list item of the original list item is not
924 // present in the device data environment on exit from the region then the
925 // list item is ignored."
926 if (!TPR.isPresent())
927 continue;
928
929 // Move data back to the host
930 const bool HasAlways = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS;
931 const bool HasFrom = ArgTypes[I] & OMP_TGT_MAPTYPE_FROM;
932 if (HasFrom && (HasAlways || TPR.Flags.IsLast) &&
933 !TPR.Flags.IsHostPointer && DataSize != 0) {
934 DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
935 DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
936 TIMESCOPE_WITH_DETAILS_AND_IDENT(
937 "DevToHost", "Size=" + std::to_string(DataSize) + "B", Loc);
938 // Wait for any previous transfer if an event is present.
939 if (void *Event = TPR.getEntry()->getEvent()) {
940 if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) {
941 REPORT("Failed to wait for event " DPxMOD ".\n", DPxPTR(Event));
942 return OFFLOAD_FAIL;
943 }
944 }
945
      Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, DataSize, AsyncInfo,
                                TPR.getEntry());
948 if (Ret != OFFLOAD_SUCCESS) {
949 REPORT("Copying data from device failed.\n");
950 return OFFLOAD_FAIL;
951 }
952
953 // As we are expecting to delete the entry the d2h copy might race
954 // with another one that also tries to delete the entry. This happens
955 // as the entry can be reused and the reuse might happen after the
956 // copy-back was issued but before it completed. Since the reuse might
957 // also copy-back a value we would race.
958 if (TPR.Flags.IsLast) {
959 if (TPR.getEntry()->addEventIfNecessary(Device, AsyncInfo) !=
960 OFFLOAD_SUCCESS)
961 return OFFLOAD_FAIL;
962 }
963 }
964
965 // Add pointer to the buffer for post-synchronize processing.
    PostProcessingPtrs->emplace_back(HstPtrBegin, DataSize, ArgTypes[I],
                                     std::move(TPR));
968 PostProcessingPtrs->back().TPR.getEntry()->unlock();
969 }
970
971 // Add post-processing functions
972 // TODO: We might want to remove `mutable` in the future by not changing the
973 // captured variables somehow.
  AsyncInfo.addPostProcessingFunction([=, Device = &Device]() mutable -> int {
    return postProcessingTargetDataEnd(Device, *PostProcessingPtrs);
976 });
977
978 return Ret;
979}
980
981static int targetDataContiguous(ident_t *Loc, DeviceTy &Device, void *ArgsBase,
982 void *HstPtrBegin, int64_t ArgSize,
983 int64_t ArgType, AsyncInfoTy &AsyncInfo) {
984 TargetPointerResultTy TPR = Device.getMappingInfo().getTgtPtrBegin(
      HstPtrBegin, ArgSize, /*UpdateRefCount=*/false,
986 /*UseHoldRefCount=*/false, /*MustContain=*/true);
987 void *TgtPtrBegin = TPR.TargetPointer;
988 if (!TPR.isPresent()) {
989 DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin));
990 if (ArgType & OMP_TGT_MAPTYPE_PRESENT) {
991 MESSAGE("device mapping required by 'present' motion modifier does not "
992 "exist for host address " DPxMOD " (%" PRId64 " bytes)",
993 DPxPTR(HstPtrBegin), ArgSize);
994 return OFFLOAD_FAIL;
995 }
996 return OFFLOAD_SUCCESS;
997 }
998
999 if (TPR.Flags.IsHostPointer) {
1000 DP("hst data:" DPxMOD " unified and shared, becomes a noop\n",
1001 DPxPTR(HstPtrBegin));
1002 return OFFLOAD_SUCCESS;
1003 }
1004
1005 if (ArgType & OMP_TGT_MAPTYPE_TO) {
1006 DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
1007 ArgSize, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
    int Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, AsyncInfo,
                                TPR.getEntry());
1010 if (Ret != OFFLOAD_SUCCESS) {
1011 REPORT("Copying data to device failed.\n");
1012 return OFFLOAD_FAIL;
1013 }
1014 if (TPR.getEntry()) {
1015 int Ret = TPR.getEntry()->foreachShadowPointerInfo(
          [&](ShadowPtrInfoTy &ShadowPtr) {
1017 DP("Restoring original target pointer value " DPxMOD " for target "
1018 "pointer " DPxMOD "\n",
1019 DPxPTR(ShadowPtr.TgtPtrVal), DPxPTR(ShadowPtr.TgtPtrAddr));
            Ret = Device.submitData(ShadowPtr.TgtPtrAddr,
                                    (void *)&ShadowPtr.TgtPtrVal,
                                    sizeof(void *), AsyncInfo);
1023 if (Ret != OFFLOAD_SUCCESS) {
1024 REPORT("Copying data to device failed.\n");
1025 return OFFLOAD_FAIL;
1026 }
1027 return OFFLOAD_SUCCESS;
1028 });
1029 if (Ret != OFFLOAD_SUCCESS) {
1030 DP("Updating shadow map failed\n");
1031 return Ret;
1032 }
1033 }
1034 }
1035
1036 if (ArgType & OMP_TGT_MAPTYPE_FROM) {
1037 DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
1038 ArgSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
    int Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, ArgSize, AsyncInfo,
                                  TPR.getEntry());
1041 if (Ret != OFFLOAD_SUCCESS) {
1042 REPORT("Copying data from device failed.\n");
1043 return OFFLOAD_FAIL;
1044 }
1045
1046 // Wait for device-to-host memcopies for whole struct to complete,
1047 // before restoring the correct host pointer.
1048 if (auto *Entry = TPR.getEntry()) {
      AsyncInfo.addPostProcessingFunction([=]() -> int {
        int Ret = Entry->foreachShadowPointerInfo(
            [&](const ShadowPtrInfoTy &ShadowPtr) {
1052 *ShadowPtr.HstPtrAddr = ShadowPtr.HstPtrVal;
1053 DP("Restoring original host pointer value " DPxMOD
1054 " for host pointer " DPxMOD "\n",
1055 DPxPTR(ShadowPtr.HstPtrVal), DPxPTR(ShadowPtr.HstPtrAddr));
1056 return OFFLOAD_SUCCESS;
1057 });
1058 Entry->unlock();
1059 if (Ret != OFFLOAD_SUCCESS) {
1060 DP("Updating shadow map failed\n");
1061 return Ret;
1062 }
1063 return OFFLOAD_SUCCESS;
1064 });
1065 }
1066 }
1067
1068 return OFFLOAD_SUCCESS;
1069}
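
// For illustration (user-side OpenMP code, not part of the runtime):
//
//   #pragma omp target update to(A[0:N])    // host-to-device copy
//   #pragma omp target update from(A[0:N])  // device-to-host copy
//
// Both directives reach targetDataContiguous for the contiguous range A[0:N];
// 'to' takes the OMP_TGT_MAPTYPE_TO branch above and 'from' takes the
// OMP_TGT_MAPTYPE_FROM branch.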
1070
1071static int targetDataNonContiguous(ident_t *Loc, DeviceTy &Device,
1072 void *ArgsBase,
1073 __tgt_target_non_contig *NonContig,
1074 uint64_t Size, int64_t ArgType,
1075 int CurrentDim, int DimSize, uint64_t Offset,
1076 AsyncInfoTy &AsyncInfo) {
1077 int Ret = OFFLOAD_SUCCESS;
1078 if (CurrentDim < DimSize) {
1079 for (unsigned int I = 0; I < NonContig[CurrentDim].Count; ++I) {
1080 uint64_t CurOffset =
1081 (NonContig[CurrentDim].Offset + I) * NonContig[CurrentDim].Stride;
1082 // we only need to transfer the first element for the last dimension
1083 // since we've already got a contiguous piece.
1084 if (CurrentDim != DimSize - 1 || I == 0) {
        Ret = targetDataNonContiguous(Loc, Device, ArgsBase, NonContig, Size,
                                      ArgType, CurrentDim + 1, DimSize,
                                      Offset + CurOffset, AsyncInfo);
1088 // Stop the whole process if any contiguous piece returns anything
1089 // other than OFFLOAD_SUCCESS.
1090 if (Ret != OFFLOAD_SUCCESS)
1091 return Ret;
1092 }
1093 }
1094 } else {
1095 char *Ptr = (char *)ArgsBase + Offset;
1096 DP("Transfer of non-contiguous : host ptr " DPxMOD " offset %" PRIu64
1097 " len %" PRIu64 "\n",
1098 DPxPTR(Ptr), Offset, Size);
    Ret = targetDataContiguous(Loc, Device, ArgsBase, Ptr, Size, ArgType,
                               AsyncInfo);
1101 }
1102 return Ret;
1103}
1104
1105static int getNonContigMergedDimension(__tgt_target_non_contig *NonContig,
1106 int32_t DimSize) {
1107 int RemovedDim = 0;
1108 for (int I = DimSize - 1; I > 0; --I) {
1109 if (NonContig[I].Count * NonContig[I].Stride == NonContig[I - 1].Stride)
1110 RemovedDim++;
1111 }
1112 return RemovedDim;
1113}
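
// For example (illustrative descriptor values): given a 3-D section described
// by
//   NonContig[2] = {Offset, Count = 10, Stride = 8}    // innermost dimension
//   NonContig[1] = {Offset, Count = 4,  Stride = 80}
//   NonContig[0] = {Offset, Count = 2,  Stride = 320}
// dimension 2 spans its parent's stride exactly (10 * 8 == 80) and so does
// dimension 1 (4 * 80 == 320), so getNonContigMergedDimension returns 2: the
// inner dimensions form one contiguous block and only the outermost dimension
// needs to be iterated by targetDataNonContiguous.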
1114
1115/// Internal function to pass data to/from the target.
1116int targetDataUpdate(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
1117 void **ArgsBase, void **Args, int64_t *ArgSizes,
1118 int64_t *ArgTypes, map_var_info_t *ArgNames,
1119 void **ArgMappers, AsyncInfoTy &AsyncInfo, bool) {
1120 // process each input.
1121 for (int32_t I = 0; I < ArgNum; ++I) {
1122 if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
1123 (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
1124 continue;
1125
1126 if (ArgMappers && ArgMappers[I]) {
1127 // Instead of executing the regular path of targetDataUpdate, call the
1128 // targetDataMapper variant which will call targetDataUpdate again
1129 // with new arguments.
1130 DP("Calling targetDataMapper for the %dth argument\n", I);
1131
1132 map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
      int Ret = targetDataMapper(Loc, Device, ArgsBase[I], Args[I],
                                 ArgSizes[I], ArgTypes[I], ArgName,
                                 ArgMappers[I], AsyncInfo, targetDataUpdate);
1136
1137 if (Ret != OFFLOAD_SUCCESS) {
1138 REPORT("Call to targetDataUpdate via targetDataMapper for custom mapper"
1139 " failed.\n");
1140 return OFFLOAD_FAIL;
1141 }
1142
1143 // Skip the rest of this function, continue to the next argument.
1144 continue;
1145 }
1146
1147 int Ret = OFFLOAD_SUCCESS;
1148
1149 if (ArgTypes[I] & OMP_TGT_MAPTYPE_NON_CONTIG) {
1150 __tgt_target_non_contig *NonContig = (__tgt_target_non_contig *)Args[I];
1151 int32_t DimSize = ArgSizes[I];
1152 uint64_t Size =
1153 NonContig[DimSize - 1].Count * NonContig[DimSize - 1].Stride;
1154 int32_t MergedDim = getNonContigMergedDimension(NonContig, DimSize);
      Ret = targetDataNonContiguous(
          Loc, Device, ArgsBase[I], NonContig, Size, ArgTypes[I],
          /*CurrentDim=*/0, DimSize - MergedDim, /*Offset=*/0, AsyncInfo);
1158 } else {
      Ret = targetDataContiguous(Loc, Device, ArgsBase[I], Args[I],
                                 ArgSizes[I], ArgTypes[I], AsyncInfo);
1161 }
1162 if (Ret == OFFLOAD_FAIL)
1163 return OFFLOAD_FAIL;
1164 }
1165 return OFFLOAD_SUCCESS;
1166}
1167
1168static const unsigned LambdaMapping = OMP_TGT_MAPTYPE_PTR_AND_OBJ |
1169 OMP_TGT_MAPTYPE_LITERAL |
1170 OMP_TGT_MAPTYPE_IMPLICIT;
1171static bool isLambdaMapping(int64_t Mapping) {
1172 return (Mapping & LambdaMapping) == LambdaMapping;
1173}
1174
1175namespace {
1176/// Find the table information in the map or look it up in the translation
1177/// tables.
1178TableMap *getTableMap(void *HostPtr) {
1179 std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx);
1180 HostPtrToTableMapTy::iterator TableMapIt =
      PM->HostPtrToTableMap.find(HostPtr);
1182
1183 if (TableMapIt != PM->HostPtrToTableMap.end())
1184 return &TableMapIt->second;
1185
1186 // We don't have a map. So search all the registered libraries.
1187 TableMap *TM = nullptr;
1188 std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
1189 for (HostEntriesBeginToTransTableTy::iterator Itr =
1190 PM->HostEntriesBeginToTransTable.begin();
1191 Itr != PM->HostEntriesBeginToTransTable.end(); ++Itr) {
1192 // get the translation table (which contains all the good info).
1193 TranslationTable *TransTable = &Itr->second;
1194 // iterate over all the host table entries to see if we can locate the
1195 // host_ptr.
1196 __tgt_offload_entry *Cur = TransTable->HostTable.EntriesBegin;
1197 for (uint32_t I = 0; Cur < TransTable->HostTable.EntriesEnd; ++Cur, ++I) {
1198 if (Cur->addr != HostPtr)
1199 continue;
1200 // we got a match, now fill the HostPtrToTableMap so that we
1201 // may avoid this search next time.
1202 TM = &(PM->HostPtrToTableMap)[HostPtr];
1203 TM->Table = TransTable;
1204 TM->Index = I;
1205 return TM;
1206 }
1207 }
1208
1209 return nullptr;
1210}
1211
/// A class that manages private arguments in a target region.
1213class PrivateArgumentManagerTy {
1214 /// A data structure for the information of first-private arguments. We can
1215 /// use this information to optimize data transfer by packing all
1216 /// first-private arguments and transfer them all at once.
1217 struct FirstPrivateArgInfoTy {
1218 /// Host pointer begin
1219 char *HstPtrBegin;
1220 /// Host pointer end
1221 char *HstPtrEnd;
1222 /// The index of the element in \p TgtArgs corresponding to the argument
1223 int Index;
1224 /// Alignment of the entry (base of the entry, not after the entry).
1225 uint32_t Alignment;
1226 /// Size (without alignment, see padding)
1227 uint32_t Size;
1228 /// Padding used to align this argument entry, if necessary.
1229 uint32_t Padding;
1230 /// Host pointer name
1231 map_var_info_t HstPtrName = nullptr;
1232
1233 FirstPrivateArgInfoTy(int Index, void *HstPtr, uint32_t Size,
1234 uint32_t Alignment, uint32_t Padding,
1235 map_var_info_t HstPtrName = nullptr)
1236 : HstPtrBegin(reinterpret_cast<char *>(HstPtr)),
1237 HstPtrEnd(HstPtrBegin + Size), Index(Index), Alignment(Alignment),
1238 Size(Size), Padding(Padding), HstPtrName(HstPtrName) {}
1239 };
1240
1241 /// A vector of target pointers for all private arguments
1242 SmallVector<void *> TgtPtrs;
1243
1244 /// A vector of information of all first-private arguments to be packed
1245 SmallVector<FirstPrivateArgInfoTy> FirstPrivateArgInfo;
1246 /// Host buffer for all arguments to be packed
1247 SmallVector<char> FirstPrivateArgBuffer;
1248 /// The total size of all arguments to be packed
1249 int64_t FirstPrivateArgSize = 0;
1250
1251 /// A reference to the \p DeviceTy object
1252 DeviceTy &Device;
1253 /// A pointer to a \p AsyncInfoTy object
1254 AsyncInfoTy &AsyncInfo;
1255
1256 // TODO: What would be the best value here? Should we make it configurable?
1257 // If the size is larger than this threshold, we will allocate and transfer it
1258 // immediately instead of packing it.
1259 static constexpr const int64_t FirstPrivateArgSizeThreshold = 1024;
1260
1261public:
1262 /// Constructor
1263 PrivateArgumentManagerTy(DeviceTy &Dev, AsyncInfoTy &AsyncInfo)
1264 : Device(Dev), AsyncInfo(AsyncInfo) {}
1265
1266 /// Add a private argument
1267 int addArg(void *HstPtr, int64_t ArgSize, int64_t ArgOffset,
1268 bool IsFirstPrivate, void *&TgtPtr, int TgtArgsIndex,
1269 map_var_info_t HstPtrName = nullptr,
1270 const bool AllocImmediately = false) {
1271 // If the argument is not first-private, or its size is greater than a
1272 // predefined threshold, we will allocate memory and issue the transfer
1273 // immediately.
1274 if (ArgSize > FirstPrivateArgSizeThreshold || !IsFirstPrivate ||
1275 AllocImmediately) {
      TgtPtr = Device.allocData(ArgSize, HstPtr);
1277 if (!TgtPtr) {
1278 DP("Data allocation for %sprivate array " DPxMOD " failed.\n",
1279 (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtr));
1280 return OFFLOAD_FAIL;
1281 }
1282#ifdef OMPTARGET_DEBUG
1283 void *TgtPtrBase = (void *)((intptr_t)TgtPtr + ArgOffset);
1284 DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD
1285 " for %sprivate array " DPxMOD " - pushing target argument " DPxMOD
1286 "\n",
1287 ArgSize, DPxPTR(TgtPtr), (IsFirstPrivate ? "first-" : ""),
1288 DPxPTR(HstPtr), DPxPTR(TgtPtrBase));
1289#endif
1290 // If first-private, copy data from host
1291 if (IsFirstPrivate) {
1292 DP("Submitting firstprivate data to the device.\n");
        int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, AsyncInfo);
        if (Ret != OFFLOAD_SUCCESS) {
          DP("Copying data to device failed.\n");
1296 return OFFLOAD_FAIL;
1297 }
1298 }
      TgtPtrs.push_back(TgtPtr);
1300 } else {
1301 DP("Firstprivate array " DPxMOD " of size %" PRId64 " will be packed\n",
1302 DPxPTR(HstPtr), ArgSize);
      // When we reach this point, the argument must meet all of the following
      // requirements:
      // 1. Its size does not exceed the threshold (see the comment for
      //    FirstPrivateArgSizeThreshold);
      // 2. It is first-private (it needs to be mapped to the target device).
      // We pack all such arguments and transfer them at once to reduce the
      // number of data transfers. Non-first-private arguments, i.e., private
      // arguments that do not need to be mapped to the target device, are not
      // included because their allocation is already very efficient with the
      // memory manager.
1313
1314 // Placeholder value
1315 TgtPtr = nullptr;
1316 auto *LastFPArgInfo =
1317 FirstPrivateArgInfo.empty() ? nullptr : &FirstPrivateArgInfo.back();
1318
1319 // Compute the start alignment of this entry, add padding if necessary.
1320 // TODO: Consider sorting instead.
1321 uint32_t Padding = 0;
1322 uint32_t StartAlignment =
1323 LastFPArgInfo ? LastFPArgInfo->Alignment : MaxAlignment;
1324 if (LastFPArgInfo) {
1325 // Check if we keep the start alignment or if it is shrunk due to the
1326 // size of the last element.
1327 uint32_t Offset = LastFPArgInfo->Size % StartAlignment;
1328 if (Offset)
1329 StartAlignment = Offset;
1330 // We only need as much alignment as the host pointer had (since we
1331 // don't know the alignment information from the source we might end up
1332 // overaligning accesses but not too much).
1333 uint32_t RequiredAlignment =
            llvm::bit_floor(getPartialStructRequiredAlignment(HstPtr));
1335 if (RequiredAlignment > StartAlignment) {
1336 Padding = RequiredAlignment - StartAlignment;
1337 StartAlignment = RequiredAlignment;
1338 }
1339 }
1340
      FirstPrivateArgInfo.emplace_back(TgtArgsIndex, HstPtr, ArgSize,
                                       StartAlignment, Padding, HstPtrName);
1343 FirstPrivateArgSize += Padding + ArgSize;
1344 }
1345
1346 return OFFLOAD_SUCCESS;
1347 }
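
  // Example for addArg above (illustrative values only): if the previously
  // packed argument was 4-byte aligned and 6 bytes long, the next entry would
  // start at an offset that is only 2-byte aligned (6 % 4 == 2). If the next
  // host pointer is 8-byte aligned, RequiredAlignment is 8, so Padding becomes
  // 8 - 2 = 6 bytes and FirstPrivateArgSize grows by Padding + ArgSize.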
1348
1349 /// Pack first-private arguments, replace place holder pointers in \p TgtArgs,
1350 /// and start the transfer.
1351 int packAndTransfer(SmallVector<void *> &TgtArgs) {
1352 if (!FirstPrivateArgInfo.empty()) {
1353 assert(FirstPrivateArgSize != 0 &&
1354 "FirstPrivateArgSize is 0 but FirstPrivateArgInfo is empty");
      FirstPrivateArgBuffer.resize(FirstPrivateArgSize, 0);
1356 auto *Itr = FirstPrivateArgBuffer.begin();
1357 // Copy all host data to this buffer
1358 for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
1359 // First pad the pointer as we (have to) pad it on the device too.
        Itr = std::next(Itr, Info.Padding);
        std::copy(Info.HstPtrBegin, Info.HstPtrEnd, Itr);
        Itr = std::next(Itr, Info.Size);
1363 }
1364 // Allocate target memory
1365 void *TgtPtr =
          Device.allocData(FirstPrivateArgSize, FirstPrivateArgBuffer.data());
1367 if (TgtPtr == nullptr) {
1368 DP("Failed to allocate target memory for private arguments.\n");
1369 return OFFLOAD_FAIL;
1370 }
      TgtPtrs.push_back(TgtPtr);
1372 DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD "\n",
1373 FirstPrivateArgSize, DPxPTR(TgtPtr));
1374 // Transfer data to target device
      int Ret = Device.submitData(TgtPtr, FirstPrivateArgBuffer.data(),
                                  FirstPrivateArgSize, AsyncInfo);
1377 if (Ret != OFFLOAD_SUCCESS) {
1378 DP("Failed to submit data of private arguments.\n");
1379 return OFFLOAD_FAIL;
1380 }
1381 // Fill in all placeholder pointers
1382 auto TP = reinterpret_cast<uintptr_t>(TgtPtr);
1383 for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) {
1384 void *&Ptr = TgtArgs[Info.Index];
        assert(Ptr == nullptr && "Target pointer is already set by mistake");
1386 // Pad the device pointer to get the right alignment.
1387 TP += Info.Padding;
1388 Ptr = reinterpret_cast<void *>(TP);
1389 TP += Info.Size;
1390 DP("Firstprivate array " DPxMOD " of size %" PRId64 " mapped to " DPxMOD
1391 "\n",
1392 DPxPTR(Info.HstPtrBegin), Info.HstPtrEnd - Info.HstPtrBegin,
1393 DPxPTR(Ptr));
1394 }
1395 }
1396
1397 return OFFLOAD_SUCCESS;
1398 }
1399
1400 /// Free all target memory allocated for private arguments
1401 int free() {
1402 for (void *P : TgtPtrs) {
      int Ret = Device.deleteData(P);
1404 if (Ret != OFFLOAD_SUCCESS) {
1405 DP("Deallocation of (first-)private arrays failed.\n");
1406 return OFFLOAD_FAIL;
1407 }
1408 }
1409
1410 TgtPtrs.clear();
1411
1412 return OFFLOAD_SUCCESS;
1413 }
1414};
1415
1416/// Process data before launching the kernel, including calling targetDataBegin
1417/// to map and transfer data to target device, transferring (first-)private
1418/// variables.
1419static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr,
1420 int32_t ArgNum, void **ArgBases, void **Args,
1421 int64_t *ArgSizes, int64_t *ArgTypes,
1422 map_var_info_t *ArgNames, void **ArgMappers,
1423 SmallVector<void *> &TgtArgs,
1424 SmallVector<ptrdiff_t> &TgtOffsets,
1425 PrivateArgumentManagerTy &PrivateArgumentManager,
1426 AsyncInfoTy &AsyncInfo) {
1427
  auto DeviceOrErr = PM->getDevice(DeviceId);
1429 if (!DeviceOrErr)
1430 FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());
1431
1432 int Ret = targetDataBegin(Loc, Device&: *DeviceOrErr, ArgNum, ArgsBase: ArgBases, Args, ArgSizes,
1433 ArgTypes, ArgNames, ArgMappers, AsyncInfo);
1434 if (Ret != OFFLOAD_SUCCESS) {
1435 REPORT("Call to targetDataBegin failed, abort target.\n");
1436 return OFFLOAD_FAIL;
1437 }
1438
1439 // List of (first-)private arrays allocated for this target region
1440 SmallVector<int> TgtArgsPositions(ArgNum, -1);
1441
1442 for (int32_t I = 0; I < ArgNum; ++I) {
1443 if (!(ArgTypes[I] & OMP_TGT_MAPTYPE_TARGET_PARAM)) {
1444 // This is not a target parameter, do not push it into TgtArgs.
1445 // Check for lambda mapping.
1446 if (isLambdaMapping(Mapping: ArgTypes[I])) {
1447 assert((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
1448 "PTR_AND_OBJ must be also MEMBER_OF.");
1449 unsigned Idx = getParentIndex(Type: ArgTypes[I]);
1450 int TgtIdx = TgtArgsPositions[Idx];
1451 assert(TgtIdx != -1 && "Base address must be translated already.");
1452 // The parent lambda must have been processed already; its position in the
1453 // TgtArgs and TgtOffsets arrays was recorded in TgtArgsPositions.
1454 void *HstPtrVal = Args[I];
1455 void *HstPtrBegin = ArgBases[I];
1456 void *HstPtrBase = Args[Idx];
1457 void *TgtPtrBase =
1458 (void *)((intptr_t)TgtArgs[TgtIdx] + TgtOffsets[TgtIdx]);
1459 DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase));
1460 uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
1461 void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta);
1462 void *&PointerTgtPtrBegin = AsyncInfo.getVoidPtrLocation();
1463 TargetPointerResultTy TPR =
1464 DeviceOrErr->getMappingInfo().getTgtPtrBegin(
1465 HstPtrBegin: HstPtrVal, Size: ArgSizes[I], /*UpdateRefCount=*/false,
1466 /*UseHoldRefCount=*/false);
1467 PointerTgtPtrBegin = TPR.TargetPointer;
1468 if (!TPR.isPresent()) {
1469 DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n",
1470 DPxPTR(HstPtrVal));
1471 continue;
1472 }
1473 if (TPR.Flags.IsHostPointer) {
1474 DP("Unified memory is active, no need to map lambda captured"
1475 "variable (" DPxMOD ")\n",
1476 DPxPTR(HstPtrVal));
1477 continue;
1478 }
1479 DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
1480 DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
1481 Ret =
1482 DeviceOrErr->submitData(TgtPtrBegin, HstPtrBegin: &PointerTgtPtrBegin,
1483 Size: sizeof(void *), AsyncInfo, Entry: TPR.getEntry());
1484 if (Ret != OFFLOAD_SUCCESS) {
1485 REPORT("Copying data to device failed.\n");
1486 return OFFLOAD_FAIL;
1487 }
1488 }
1489 continue;
1490 }
1491 void *HstPtrBegin = Args[I];
1492 void *HstPtrBase = ArgBases[I];
1493 void *TgtPtrBegin;
1494 map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I];
1495 ptrdiff_t TgtBaseOffset;
1496 TargetPointerResultTy TPR;
1497 if (ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) {
1498 DP("Forwarding first-private value " DPxMOD " to the target construct\n",
1499 DPxPTR(HstPtrBase));
1500 TgtPtrBegin = HstPtrBase;
1501 TgtBaseOffset = 0;
1502 } else if (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE) {
1503 TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
1504 const bool IsFirstPrivate = (ArgTypes[I] & OMP_TGT_MAPTYPE_TO);
1505 // If there is a next argument and it depends on the current one, we need
1506 // to allocate the private memory immediately. If this is not the case,
1507 // then the argument can be marked for optimization and packed with the
1508 // other privates.
1509 const bool AllocImmediately =
1510 (I < ArgNum - 1 && (ArgTypes[I + 1] & OMP_TGT_MAPTYPE_MEMBER_OF));
1511 Ret = PrivateArgumentManager.addArg(
1512 HstPtr: HstPtrBegin, ArgSize: ArgSizes[I], ArgOffset: TgtBaseOffset, IsFirstPrivate, TgtPtr&: TgtPtrBegin,
1513 TgtArgsIndex: TgtArgs.size(), HstPtrName, AllocImmediately);
1514 if (Ret != OFFLOAD_SUCCESS) {
1515 REPORT("Failed to process %sprivate argument " DPxMOD "\n",
1516 (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtrBegin));
1517 return OFFLOAD_FAIL;
1518 }
1519 } else {
1520 if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)
1521 HstPtrBase = *reinterpret_cast<void **>(HstPtrBase);
1522 TPR = DeviceOrErr->getMappingInfo().getTgtPtrBegin(
1523 HstPtrBegin, Size: ArgSizes[I],
1524 /*UpdateRefCount=*/false,
1525 /*UseHoldRefCount=*/false);
1526 TgtPtrBegin = TPR.TargetPointer;
1527 TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
1528#ifdef OMPTARGET_DEBUG
1529 void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset);
1530 DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n",
1531 DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin));
1532#endif
1533 }
1534 TgtArgsPositions[I] = TgtArgs.size();
1535 TgtArgs.push_back(Elt: TgtPtrBegin);
1536 TgtOffsets.push_back(Elt: TgtBaseOffset);
1537 }
1538
1539 assert(TgtArgs.size() == TgtOffsets.size() &&
1540 "Size mismatch in arguments and offsets");
1541
1542 // Pack and transfer first-private arguments
1543 Ret = PrivateArgumentManager.packAndTransfer(TgtArgs);
1544 if (Ret != OFFLOAD_SUCCESS) {
1545 DP("Failed to pack and transfer first private arguments\n");
1546 return OFFLOAD_FAIL;
1547 }
1548
1549 return OFFLOAD_SUCCESS;
1550}
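
// Editorial sketch (not part of the runtime): how a source-level construct
// maps onto the Arg* arrays consumed by processDataBefore(); the exact
// map-type bits depend on the compiler version.
//
//   int A[64]; double S = 3.0;
//   #pragma omp target map(tofrom: A[0:64]) firstprivate(S)
//   { ... }
//
// Clang typically emits two argument entries:
//   A: ArgTypes[0] ~ TARGET_PARAM | TO | FROM, handled by targetDataBegin()
//      and translated through getTgtPtrBegin() above.
//   S: ArgTypes[1] ~ TARGET_PARAM | LITERAL when the value fits in a pointer
//      (forwarded by value), or TARGET_PARAM | PRIVATE | TO otherwise
//      (routed through PrivateArgumentManagerTy::addArg()).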
1551
1552/// Process data after launching the kernel, including transferring data back to
1553/// host if needed and deallocating target memory of (first-)private variables.
1554static int processDataAfter(ident_t *Loc, int64_t DeviceId, void *HostPtr,
1555 int32_t ArgNum, void **ArgBases, void **Args,
1556 int64_t *ArgSizes, int64_t *ArgTypes,
1557 map_var_info_t *ArgNames, void **ArgMappers,
1558 PrivateArgumentManagerTy &PrivateArgumentManager,
1559 AsyncInfoTy &AsyncInfo) {
1560
1561 auto DeviceOrErr = PM->getDevice(DeviceNo: DeviceId);
1562 if (!DeviceOrErr)
1563 FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());
1564
1565 // Move data from device.
1566 int Ret = targetDataEnd(Loc, Device&: *DeviceOrErr, ArgNum, ArgBases, Args, ArgSizes,
1567 ArgTypes, ArgNames, ArgMappers, AsyncInfo);
1568 if (Ret != OFFLOAD_SUCCESS) {
1569 REPORT("Call to targetDataEnd failed, abort target.\n");
1570 return OFFLOAD_FAIL;
1571 }
1572
1573 // Free target memory for private arguments after synchronization.
1574 // TODO: We might want to remove `mutable` in the future by not changing the
1575 // captured variables somehow.
1576 AsyncInfo.addPostProcessingFunction(
1577 Function: [PrivateArgumentManager =
1578 std::move(PrivateArgumentManager)]() mutable -> int {
1579 int Ret = PrivateArgumentManager.free();
1580 if (Ret != OFFLOAD_SUCCESS) {
1581 REPORT("Failed to deallocate target memory for private args\n");
1582 return OFFLOAD_FAIL;
1583 }
1584 return Ret;
1585 });
1586
1587 return OFFLOAD_SUCCESS;
1588}
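
// Editorial note on ordering: because the deallocation above is registered as
// a post-processing function instead of being executed inline, it only runs
// once AsyncInfoTy::synchronize() has drained the queue, i.e. after the
// device-to-host copies issued by targetDataEnd() have completed. A sketch of
// the resulting sequence for an asynchronous launch:
//
//   processDataAfter(...)      // enqueue D2H copies, register the free()
//   ...                        // unrelated work may overlap here
//   AsyncInfo.synchronize()    // wait for the queue, then run post-processing
//                              //   -> PrivateArgumentManager.free()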
1589} // namespace
1590
1591/// Performs the same actions as data_begin when arg_num is non-zero and
1592/// initiates the run of the offloaded region on the target platform; if
1593/// arg_num is non-zero, it also performs the same actions as data_update and
1594/// data_end above once the region has finished executing. This function
1595/// returns 0 if it was able to transfer the execution to a target and an
1596/// integer different from zero otherwise.
1597int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
1598 KernelArgsTy &KernelArgs, AsyncInfoTy &AsyncInfo) {
1599 int32_t DeviceId = Device.DeviceID;
1600 TableMap *TM = getTableMap(HostPtr);
1601 // No map for this host pointer found!
1602 if (!TM) {
1603 REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n",
1604 DPxPTR(HostPtr));
1605 return OFFLOAD_FAIL;
1606 }
1607
1608 // get target table.
1609 __tgt_target_table *TargetTable = nullptr;
1610 {
1611 std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
1612 assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
1613 "Not expecting a device ID outside the table's bounds!");
1614 TargetTable = TM->Table->TargetsTable[DeviceId];
1615 }
1616 assert(TargetTable && "Global data has not been mapped\n");
1617
1618 DP("loop trip count is %" PRIu64 ".\n", KernelArgs.Tripcount);
1619
1620 // We need to keep bases and offsets separate. Sometimes (e.g. in OpenCL) we
1621 // need to manifest base pointers prior to launching a kernel. Even if we have
1622 // mapped an object only partially, e.g. A[N:M], although the kernel is
1623 // expected to access elements starting at address &A[N] and beyond, we still
1624 // need to manifest the base of the array &A[0]. In other cases, e.g. the COI
1625 // API, we need the begin address itself, i.e. &A[N], as the API operates on
1626 // begin addresses, not bases. That's why we pass args and offsets as two
1627 // separate entities so that each plugin can do what it needs. This behavior
1628 // was introduced via https://reviews.llvm.org/D33028 and commit 1546d319244c.
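// An illustrative example (editorial, int array assumed): mapping A[2:10]
// yields HstPtrBegin = &A[2] and HstPtrBase = &A[0], so processDataBefore()
// records
//   TgtArgs[i]    = device pointer corresponding to &A[2]
//   TgtOffsets[i] = (char *)&A[0] - (char *)&A[2] = -8
// and a plugin that needs the base address can reconstruct it as
//   TgtArgs[i] + TgtOffsets[i].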
1629 SmallVector<void *> TgtArgs;
1630 SmallVector<ptrdiff_t> TgtOffsets;
1631
1632 PrivateArgumentManagerTy PrivateArgumentManager(Device, AsyncInfo);
1633
1634 int NumClangLaunchArgs = KernelArgs.NumArgs;
1635 int Ret = OFFLOAD_SUCCESS;
1636 if (NumClangLaunchArgs) {
1637 // Process data, such as data mapping, before launching the kernel
1638 Ret = processDataBefore(Loc, DeviceId, HostPtr, ArgNum: NumClangLaunchArgs,
1639 ArgBases: KernelArgs.ArgBasePtrs, Args: KernelArgs.ArgPtrs,
1640 ArgSizes: KernelArgs.ArgSizes, ArgTypes: KernelArgs.ArgTypes,
1641 ArgNames: KernelArgs.ArgNames, ArgMappers: KernelArgs.ArgMappers, TgtArgs,
1642 TgtOffsets, PrivateArgumentManager, AsyncInfo);
1643 if (Ret != OFFLOAD_SUCCESS) {
1644 REPORT("Failed to process data before launching the kernel.\n");
1645 return OFFLOAD_FAIL;
1646 }
1647
1648 // Clang might pass more values via the ArgPtrs to the runtime than we pass
1649 // on to the kernel.
1650 // TODO: Next time we adjust the KernelArgsTy we should introduce a new
1651 // NumKernelArgs field.
1652 KernelArgs.NumArgs = TgtArgs.size();
1653 }
1654
1655 // Launch device execution.
1656 void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr;
1657 DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
1658 TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index);
1659
1660 {
1661 assert(KernelArgs.NumArgs == TgtArgs.size() && "Argument count mismatch!");
1662 TIMESCOPE_WITH_DETAILS_AND_IDENT(
1663 "Kernel Target",
1664 "NumArguments=" + std::to_string(KernelArgs.NumArgs) +
1665 ";NumTeams=" + std::to_string(KernelArgs.NumTeams[0]) +
1666 ";TripCount=" + std::to_string(KernelArgs.Tripcount),
1667 Loc);
1668
1669#ifdef OMPT_SUPPORT
1670 assert(KernelArgs.NumTeams[1] == 0 && KernelArgs.NumTeams[2] == 0 &&
1671 "Multi dimensional launch not supported yet.");
1672 /// RAII to establish tool anchors before and after kernel launch
1673 int32_t NumTeams = KernelArgs.NumTeams[0];
1674 // No need to guard this with OMPT_IF_BUILT
1675 InterfaceRAII TargetSubmitRAII(
1676 RegionInterface.getCallbacks<ompt_callback_target_submit>(), NumTeams);
1677#endif
1678
1679 Ret = Device.launchKernel(TgtEntryPtr, TgtVarsPtr: TgtArgs.data(), TgtOffsets: TgtOffsets.data(),
1680 KernelArgs, AsyncInfo);
1681 }
1682
1683 if (Ret != OFFLOAD_SUCCESS) {
1684 REPORT("Executing target region abort target.\n");
1685 return OFFLOAD_FAIL;
1686 }
1687
1688 if (NumClangLaunchArgs) {
1689 // Transfer data back and deallocate target memory for (first-)private
1690 // variables
1691 Ret = processDataAfter(Loc, DeviceId, HostPtr, ArgNum: NumClangLaunchArgs,
1692 ArgBases: KernelArgs.ArgBasePtrs, Args: KernelArgs.ArgPtrs,
1693 ArgSizes: KernelArgs.ArgSizes, ArgTypes: KernelArgs.ArgTypes,
1694 ArgNames: KernelArgs.ArgNames, ArgMappers: KernelArgs.ArgMappers,
1695 PrivateArgumentManager, AsyncInfo);
1696 if (Ret != OFFLOAD_SUCCESS) {
1697 REPORT("Failed to process data after launching the kernel.\n");
1698 return OFFLOAD_FAIL;
1699 }
1700 }
1701
1702 return OFFLOAD_SUCCESS;
1703}
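
// Editorial sketch of the call path leading here: for user code such as
//
//   #pragma omp target teams map(tofrom: x)
//   { x += 1; }
//
// Clang emits a call to one of the __tgt_target_kernel entry points declared
// in omptarget.h; the interface layer packs the kernel arguments into a
// KernelArgsTy and eventually reaches target() above, which maps the data,
// launches the kernel through the device plugin, and unmaps the data again.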
1704
1705/// Enables the record-replay mechanism by pre-allocating MemorySize bytes
1706/// and informing the record-replayer whether to store the output
1707/// in a file.
1708int target_activate_rr(DeviceTy &Device, uint64_t MemorySize, void *VAddr,
1709 bool IsRecord, bool SaveOutput,
1710 uint64_t &ReqPtrArgOffset) {
1711 return Device.RTL->initialize_record_replay(Device.DeviceID, MemorySize,
1712 VAddr, IsRecord, SaveOutput,
1713 ReqPtrArgOffset);
1714}
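
// Editorial usage sketch (the values are made up): a tool that wants to record
// a kernel would reserve device memory up front and enable recording before
// the first launch, e.g.
//
//   uint64_t ReqOffset = 0;
//   if (target_activate_rr(Device, /*MemorySize=*/1ULL << 30,
//                          /*VAddr=*/nullptr, /*IsRecord=*/true,
//                          /*SaveOutput=*/true, ReqOffset) != OFFLOAD_SUCCESS)
//     ; // record-replay is not supported by this plugin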
1715
1716/// Executes a kernel using pre-recorded information: loads the recorded
1717/// image into device memory and launches the target kernel with the
1718/// pre-recorded configuration.
1719int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
1720 void *DeviceMemory, int64_t DeviceMemorySize, void **TgtArgs,
1721 ptrdiff_t *TgtOffsets, int32_t NumArgs, int32_t NumTeams,
1722 int32_t ThreadLimit, uint64_t LoopTripCount,
1723 AsyncInfoTy &AsyncInfo) {
1724 int32_t DeviceId = Device.DeviceID;
1725 TableMap *TM = getTableMap(HostPtr);
1726 // Fail if the table map fails to find the target kernel pointer for the
1727 // provided host pointer.
1728 if (!TM) {
1729 REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n",
1730 DPxPTR(HostPtr));
1731 return OFFLOAD_FAIL;
1732 }
1733
1734 // Retrieve the target table of offloading entries.
1735 __tgt_target_table *TargetTable = nullptr;
1736 {
1737 std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
1738 assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
1739 "Not expecting a device ID outside the table's bounds!");
1740 TargetTable = TM->Table->TargetsTable[DeviceId];
1741 }
1742 assert(TargetTable && "Global data has not been mapped\n");
1743
1744 // Retrieve the target kernel pointer, allocate and store the recorded device
1745 // memory data, and launch device execution.
1746 void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr;
1747 DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
1748 TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index);
1749
1750 void *TgtPtr = Device.allocData(Size: DeviceMemorySize, /*HstPtr=*/nullptr,
1751 Kind: TARGET_ALLOC_DEFAULT);
1752 Device.submitData(TgtPtrBegin: TgtPtr, HstPtrBegin: DeviceMemory, Size: DeviceMemorySize, AsyncInfo);
1753
1754 KernelArgsTy KernelArgs = {.Version: 0};
1755 KernelArgs.Version = 2;
1756 KernelArgs.NumArgs = NumArgs;
1757 KernelArgs.Tripcount = LoopTripCount;
1758 KernelArgs.NumTeams[0] = NumTeams;
1759 KernelArgs.ThreadLimit[0] = ThreadLimit;
1760
1761 int Ret = Device.launchKernel(TgtEntryPtr, TgtVarsPtr: TgtArgs, TgtOffsets, KernelArgs,
1762 AsyncInfo);
1763
1764 if (Ret != OFFLOAD_SUCCESS) {
1765 REPORT("Executing target region abort target.\n");
1766 return OFFLOAD_FAIL;
1767 }
1768
1769 return OFFLOAD_SUCCESS;
1770}
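
// Editorial note: target_replay() is essentially a trimmed-down target(): the
// recorded device memory image is restored with a single allocData() /
// submitData() pair instead of going through targetDataBegin(), and the kernel
// is launched directly with the recorded arguments, offsets, team count,
// thread limit, and trip count.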
1771
