1 | //===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // Implementation of the interface to be used by Clang during the codegen of a |
10 | // target region. |
11 | // |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #include "omptarget.h" |
15 | #include "OffloadPolicy.h" |
16 | #include "OpenMP/OMPT/Callback.h" |
17 | #include "OpenMP/OMPT/Interface.h" |
18 | #include "PluginManager.h" |
19 | #include "Shared/Debug.h" |
20 | #include "Shared/EnvironmentVar.h" |
21 | #include "Shared/Utils.h" |
22 | #include "device.h" |
23 | #include "private.h" |
24 | #include "rtl.h" |
25 | |
26 | #include "Shared/Profile.h" |
27 | |
28 | #include "OpenMP/Mapping.h" |
29 | #include "OpenMP/omp.h" |
30 | |
31 | #include "llvm/ADT/StringExtras.h" |
32 | #include "llvm/ADT/bit.h" |
33 | #include "llvm/Object/ObjectFile.h" |
34 | |
35 | #include <cassert> |
36 | #include <cstdint> |
37 | #include <vector> |
38 | |
39 | using llvm::SmallVector; |
40 | #ifdef OMPT_SUPPORT |
41 | using namespace llvm::omp::target::ompt; |
42 | #endif |
43 | |
44 | int AsyncInfoTy::synchronize() { |
45 | int Result = OFFLOAD_SUCCESS; |
46 | if (!isQueueEmpty()) { |
47 | switch (SyncType) { |
48 | case SyncTy::BLOCKING: |
49 | // If we have a queue we need to synchronize it now. |
50 | Result = Device.synchronize(AsyncInfo&: *this); |
51 | assert(AsyncInfo.Queue == nullptr && |
52 | "The device plugin should have nulled the queue to indicate there " |
53 | "are no outstanding actions!" ); |
54 | break; |
55 | case SyncTy::NON_BLOCKING: |
56 | Result = Device.queryAsync(AsyncInfo&: *this); |
57 | break; |
58 | } |
59 | } |
60 | |
61 | // Run any pending post-processing function registered on this async object. |
62 | if (Result == OFFLOAD_SUCCESS && isQueueEmpty()) |
63 | Result = runPostProcessing(); |
64 | |
65 | return Result; |
66 | } |
67 | |
68 | void *&AsyncInfoTy::getVoidPtrLocation() { |
69 | BufferLocations.push_back(x: nullptr); |
70 | return BufferLocations.back(); |
71 | } |
72 | |
73 | bool AsyncInfoTy::isDone() const { return isQueueEmpty(); } |
74 | |
75 | int32_t AsyncInfoTy::runPostProcessing() { |
76 | size_t Size = PostProcessingFunctions.size(); |
77 | for (size_t I = 0; I < Size; ++I) { |
78 | const int Result = PostProcessingFunctions[I](); |
79 | if (Result != OFFLOAD_SUCCESS) |
80 | return Result; |
81 | } |
82 | |
83 | // Clear the vector up until the last known function, since post-processing |
84 | // procedures might add new procedures themselves. |
85 | const auto *PrevBegin = PostProcessingFunctions.begin(); |
86 | PostProcessingFunctions.erase(CS: PrevBegin, CE: PrevBegin + Size); |
87 | |
88 | return OFFLOAD_SUCCESS; |
89 | } |
90 | |
91 | bool AsyncInfoTy::isQueueEmpty() const { return AsyncInfo.Queue == nullptr; } |
92 | |
/* All begin addresses for partially mapped structs must be aligned, up to 16,
 * in order to ensure proper alignment of members. E.g.
 *
 * struct S {
 *   int a;   // 4-aligned
 *   int b;   // 4-aligned
 *   int *p;  // 8-aligned
 * } s1;
 * ...
 * #pragma omp target map(tofrom: s1.b, s1.p[0:N])
 * {
 *   s1.b = 5;
 *   for (int i...) s1.p[i] = ...;
 * }
 *
 * Here we are mapping s1 starting from member b, so BaseAddress=&s1=&s1.a and
 * BeginAddress=&s1.b. Let's assume that the struct begins at address 0x100,
 * then &s1.a=0x100, &s1.b=0x104, &s1.p=0x108. Each member obeys the alignment
 * requirements for its type. Now, when we allocate memory on the device, in
 * CUDA's case cuMemAlloc() returns an address which is at least 256-aligned.
 * This means that the chunk of the struct on the device will start at a
 * 256-aligned address, let's say 0x200. Then the address of b will be 0x200 and
 * address of p will be a misaligned 0x204 (on the host there was no need to add
 * padding between b and p, so p comes exactly 4 bytes after b). If the device
 * kernel tries to access s1.p, a misaligned address error occurs (as reported
 * by the CUDA plugin). By padding the begin address down to a multiple of 8 and
 * extending the size of the allocated chunk accordingly, the chunk on the
 * device will start at 0x200 with the padding (4 bytes), then &s1.b=0x204 and
 * &s1.p=0x208, as they should be to satisfy the alignment requirements.
 */
static constexpr int64_t MaxAlignment = 16;

/// Return the alignment requirement of partially mapped structs, see
/// MaxAlignment above: the host base address's natural alignment, capped at
/// MaxAlignment.
static uint64_t getPartialStructRequiredAlignment(void *HstPtrBase) {
  int LowestOneBit = __builtin_ffsl(reinterpret_cast<uintptr_t>(HstPtrBase));
  // A null base has no set bits (ffsl returns 0); treat it as maximally
  // aligned rather than shifting by -1, which is undefined behavior.
  if (LowestOneBit == 0)
    return MaxAlignment;
  // Shift in 64 bits: a plain `1 << n` is a signed-int shift and overflows
  // when the lowest set bit of the address is at position 32 or higher.
  uint64_t BaseAlignment = uint64_t(1) << (LowestOneBit - 1);
  return MaxAlignment < BaseAlignment ? MaxAlignment : BaseAlignment;
}
132 | |
133 | /// Map global data and execute pending ctors |
134 | static int initLibrary(DeviceTy &Device) { |
135 | /* |
136 | * Map global data |
137 | */ |
138 | int32_t DeviceId = Device.DeviceID; |
139 | int Rc = OFFLOAD_SUCCESS; |
140 | { |
141 | std::lock_guard<decltype(PM->TrlTblMtx)> LG(PM->TrlTblMtx); |
142 | for (auto *HostEntriesBegin : PM->HostEntriesBeginRegistrationOrder) { |
143 | TranslationTable *TransTable = |
144 | &PM->HostEntriesBeginToTransTable[HostEntriesBegin]; |
145 | if (TransTable->HostTable.EntriesBegin == |
146 | TransTable->HostTable.EntriesEnd) { |
147 | // No host entry so no need to proceed |
148 | continue; |
149 | } |
150 | |
151 | if (TransTable->TargetsTable[DeviceId] != 0) { |
152 | // Library entries have already been processed |
153 | continue; |
154 | } |
155 | |
156 | // 1) get image. |
157 | assert(TransTable->TargetsImages.size() > (size_t)DeviceId && |
158 | "Not expecting a device ID outside the table's bounds!" ); |
159 | __tgt_device_image *Img = TransTable->TargetsImages[DeviceId]; |
160 | if (!Img) { |
161 | REPORT("No image loaded for device id %d.\n" , DeviceId); |
162 | Rc = OFFLOAD_FAIL; |
163 | break; |
164 | } |
165 | |
166 | // 2) Load the image onto the given device. |
167 | auto BinaryOrErr = Device.loadBinary(Img); |
168 | if (llvm::Error Err = BinaryOrErr.takeError()) { |
169 | REPORT("Failed to load image %s\n" , |
170 | llvm::toString(std::move(Err)).c_str()); |
171 | Rc = OFFLOAD_FAIL; |
172 | break; |
173 | } |
174 | |
175 | // 3) Create the translation table. |
176 | llvm::SmallVector<__tgt_offload_entry> &DeviceEntries = |
177 | TransTable->TargetsEntries[DeviceId]; |
178 | for (__tgt_offload_entry &Entry : |
179 | llvm::make_range(x: Img->EntriesBegin, y: Img->EntriesEnd)) { |
180 | __tgt_device_binary &Binary = *BinaryOrErr; |
181 | |
182 | __tgt_offload_entry DeviceEntry = Entry; |
183 | if (Entry.size) { |
184 | if (Device.RTL->get_global(Binary, Entry.size, Entry.name, |
185 | &DeviceEntry.addr) != OFFLOAD_SUCCESS) |
186 | REPORT("Failed to load symbol %s\n" , Entry.name); |
187 | |
188 | // If unified memory is active, the corresponding global is a device |
189 | // reference to the host global. We need to initialize the pointer on |
190 | // the deive to point to the memory on the host. |
191 | if ((PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY) || |
192 | (PM->getRequirements() & OMPX_REQ_AUTO_ZERO_COPY)) { |
193 | if (Device.RTL->data_submit(DeviceId, DeviceEntry.addr, Entry.addr, |
194 | Entry.size) != OFFLOAD_SUCCESS) |
195 | REPORT("Failed to write symbol for USM %s\n" , Entry.name); |
196 | } |
197 | } else { |
198 | if (Device.RTL->get_function(Binary, Entry.name, &DeviceEntry.addr) != |
199 | OFFLOAD_SUCCESS) |
200 | REPORT("Failed to load kernel %s\n" , Entry.name); |
201 | } |
202 | DP("Entry point " DPxMOD " maps to%s %s (" DPxMOD ")\n" , |
203 | DPxPTR(Entry.addr), (Entry.size) ? " global" : "" , Entry.name, |
204 | DPxPTR(DeviceEntry.addr)); |
205 | |
206 | DeviceEntries.emplace_back(Args&: DeviceEntry); |
207 | } |
208 | |
209 | // Set the storage for the table and get a pointer to it. |
210 | __tgt_target_table DeviceTable{.EntriesBegin: &DeviceEntries[0], |
211 | .EntriesEnd: &DeviceEntries[0] + DeviceEntries.size()}; |
212 | TransTable->DeviceTables[DeviceId] = DeviceTable; |
213 | __tgt_target_table *TargetTable = TransTable->TargetsTable[DeviceId] = |
214 | &TransTable->DeviceTables[DeviceId]; |
215 | |
216 | // 4) Verify whether the two table sizes match. |
217 | size_t Hsize = |
218 | TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin; |
219 | size_t Tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin; |
220 | |
221 | // Invalid image for these host entries! |
222 | if (Hsize != Tsize) { |
223 | REPORT( |
224 | "Host and Target tables mismatch for device id %d [%zx != %zx].\n" , |
225 | DeviceId, Hsize, Tsize); |
226 | TransTable->TargetsImages[DeviceId] = 0; |
227 | TransTable->TargetsTable[DeviceId] = 0; |
228 | Rc = OFFLOAD_FAIL; |
229 | break; |
230 | } |
231 | |
232 | MappingInfoTy::HDTTMapAccessorTy HDTTMap = |
233 | Device.getMappingInfo().HostDataToTargetMap.getExclusiveAccessor(); |
234 | |
235 | __tgt_target_table *HostTable = &TransTable->HostTable; |
236 | for (__tgt_offload_entry *CurrDeviceEntry = TargetTable->EntriesBegin, |
237 | *CurrHostEntry = HostTable->EntriesBegin, |
238 | *EntryDeviceEnd = TargetTable->EntriesEnd; |
239 | CurrDeviceEntry != EntryDeviceEnd; |
240 | CurrDeviceEntry++, CurrHostEntry++) { |
241 | if (CurrDeviceEntry->size == 0) |
242 | continue; |
243 | |
244 | assert(CurrDeviceEntry->size == CurrHostEntry->size && |
245 | "data size mismatch" ); |
246 | |
247 | // Fortran may use multiple weak declarations for the same symbol, |
248 | // therefore we must allow for multiple weak symbols to be loaded from |
249 | // the fat binary. Treat these mappings as any other "regular" |
250 | // mapping. Add entry to map. |
251 | if (Device.getMappingInfo().getTgtPtrBegin(HDTTMap, HstPtrBegin: CurrHostEntry->addr, |
252 | Size: CurrHostEntry->size)) |
253 | continue; |
254 | |
255 | void *CurrDeviceEntryAddr = CurrDeviceEntry->addr; |
256 | |
257 | // For indirect mapping, follow the indirection and map the actual |
258 | // target. |
259 | if (CurrDeviceEntry->flags & OMP_DECLARE_TARGET_INDIRECT) { |
260 | AsyncInfoTy AsyncInfo(Device); |
261 | void *DevPtr; |
262 | Device.retrieveData(HstPtrBegin: &DevPtr, TgtPtrBegin: CurrDeviceEntryAddr, Size: sizeof(void *), |
263 | AsyncInfo, /*Entry=*/nullptr, HDTTMapPtr: &HDTTMap); |
264 | if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS) |
265 | return OFFLOAD_FAIL; |
266 | CurrDeviceEntryAddr = DevPtr; |
267 | } |
268 | |
269 | DP("Add mapping from host " DPxMOD " to device " DPxMOD " with size %zu" |
270 | ", name \"%s\"\n" , |
271 | DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr), |
272 | CurrDeviceEntry->size, CurrDeviceEntry->name); |
273 | HDTTMap->emplace(args: new HostDataToTargetTy( |
274 | (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/, |
275 | (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/, |
276 | (uintptr_t)CurrHostEntry->addr + CurrHostEntry->size /*HstPtrEnd*/, |
277 | (uintptr_t)CurrDeviceEntryAddr /*TgtAllocBegin*/, |
278 | (uintptr_t)CurrDeviceEntryAddr /*TgtPtrBegin*/, |
279 | false /*UseHoldRefCount*/, CurrHostEntry->name, |
280 | true /*IsRefCountINF*/)); |
281 | |
282 | // Notify about the new mapping. |
283 | if (Device.notifyDataMapped(HstPtr: CurrHostEntry->addr, Size: CurrHostEntry->size)) |
284 | return OFFLOAD_FAIL; |
285 | } |
286 | } |
287 | } |
288 | |
289 | if (Rc != OFFLOAD_SUCCESS) |
290 | return Rc; |
291 | |
292 | static Int32Envar DumpOffloadEntries = |
293 | Int32Envar("OMPTARGET_DUMP_OFFLOAD_ENTRIES" , -1); |
294 | if (DumpOffloadEntries.get() == DeviceId) |
295 | Device.dumpOffloadEntries(); |
296 | |
297 | return OFFLOAD_SUCCESS; |
298 | } |
299 | |
300 | void handleTargetOutcome(bool Success, ident_t *Loc) { |
301 | switch (OffloadPolicy::get(PM&: *PM).Kind) { |
302 | case OffloadPolicy::DISABLED: |
303 | if (Success) { |
304 | FATAL_MESSAGE0(1, "expected no offloading while offloading is disabled" ); |
305 | } |
306 | break; |
307 | case OffloadPolicy::MANDATORY: |
308 | if (!Success) { |
309 | if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE) { |
310 | auto ExclusiveDevicesAccessor = PM->getExclusiveDevicesAccessor(); |
311 | for (auto &Device : PM->devices(DevicesAccessor&: ExclusiveDevicesAccessor)) |
312 | dumpTargetPointerMappings(Loc, Device); |
313 | } else |
314 | FAILURE_MESSAGE("Consult https://openmp.llvm.org/design/Runtimes.html " |
315 | "for debugging options.\n" ); |
316 | |
317 | if (!PM->getNumUsedPlugins()) { |
318 | FAILURE_MESSAGE( |
319 | "No images found compatible with the installed hardware. " ); |
320 | |
321 | llvm::SmallVector<llvm::StringRef> Archs; |
322 | for (auto &Image : PM->deviceImages()) { |
323 | const char *Start = reinterpret_cast<const char *>( |
324 | Image.getExecutableImage().ImageStart); |
325 | uint64_t Length = llvm::omp::target::getPtrDiff( |
326 | End: Start, Begin: Image.getExecutableImage().ImageEnd); |
327 | llvm::MemoryBufferRef Buffer(llvm::StringRef(Start, Length), |
328 | /*Identifier=*/"" ); |
329 | |
330 | auto ObjectOrErr = llvm::object::ObjectFile::createObjectFile(Object: Buffer); |
331 | if (auto Err = ObjectOrErr.takeError()) { |
332 | llvm::consumeError(Err: std::move(Err)); |
333 | continue; |
334 | } |
335 | |
336 | if (auto CPU = (*ObjectOrErr)->tryGetCPUName()) |
337 | Archs.push_back(Elt: *CPU); |
338 | } |
339 | fprintf(stderr, format: "Found %zu image(s): (%s)\n" , Archs.size(), |
340 | llvm::join(R&: Archs, Separator: "," ).c_str()); |
341 | } |
342 | |
343 | SourceInfo Info(Loc); |
344 | if (Info.isAvailible()) |
345 | fprintf(stderr, format: "%s:%d:%d: " , Info.getFilename(), Info.getLine(), |
346 | Info.getColumn()); |
347 | else |
348 | FAILURE_MESSAGE("Source location information not present. Compile with " |
349 | "-g or -gline-tables-only.\n" ); |
350 | FATAL_MESSAGE0( |
351 | 1, "failure of target construct while offloading is mandatory" ); |
352 | } else { |
353 | if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE) { |
354 | auto ExclusiveDevicesAccessor = PM->getExclusiveDevicesAccessor(); |
355 | for (auto &Device : PM->devices(DevicesAccessor&: ExclusiveDevicesAccessor)) |
356 | dumpTargetPointerMappings(Loc, Device); |
357 | } |
358 | } |
359 | break; |
360 | } |
361 | } |
362 | |
363 | // If offload is enabled, ensure that device DeviceID has been initialized, |
364 | // global ctors have been executed, and global data has been mapped. |
365 | // |
366 | // The return bool indicates if the offload is to the host device |
367 | // There are three possible results: |
368 | // - Return false if the taregt device is ready for offload |
369 | // - Return true without reporting a runtime error if offload is |
370 | // disabled, perhaps because the initial device was specified. |
371 | // - Report a runtime error and return true. |
372 | // |
373 | // If DeviceID == OFFLOAD_DEVICE_DEFAULT, set DeviceID to the default device. |
374 | // This step might be skipped if offload is disabled. |
375 | bool checkDeviceAndCtors(int64_t &DeviceID, ident_t *Loc) { |
376 | if (OffloadPolicy::get(PM&: *PM).Kind == OffloadPolicy::DISABLED) { |
377 | DP("Offload is disabled\n" ); |
378 | return true; |
379 | } |
380 | |
381 | if (DeviceID == OFFLOAD_DEVICE_DEFAULT) { |
382 | DeviceID = omp_get_default_device(); |
383 | DP("Use default device id %" PRId64 "\n" , DeviceID); |
384 | } |
385 | |
386 | // Proposed behavior for OpenMP 5.2 in OpenMP spec github issue 2669. |
387 | if (omp_get_num_devices() == 0) { |
388 | DP("omp_get_num_devices() == 0 but offload is manadatory\n" ); |
389 | handleTargetOutcome(Success: false, Loc); |
390 | return true; |
391 | } |
392 | |
393 | if (DeviceID == omp_get_initial_device()) { |
394 | DP("Device is host (%" PRId64 "), returning as if offload is disabled\n" , |
395 | DeviceID); |
396 | return true; |
397 | } |
398 | |
399 | auto DeviceOrErr = PM->getDevice(DeviceNo: DeviceID); |
400 | if (!DeviceOrErr) |
401 | FATAL_MESSAGE(DeviceID, "%s" , toString(DeviceOrErr.takeError()).data()); |
402 | |
403 | // Check whether global data has been mapped for this device |
404 | if (initLibrary(Device&: *DeviceOrErr) != OFFLOAD_SUCCESS) { |
405 | REPORT("Failed to init globals on device %" PRId64 "\n" , DeviceID); |
406 | handleTargetOutcome(Success: false, Loc); |
407 | return true; |
408 | } |
409 | |
410 | return false; |
411 | } |
412 | |
413 | static int32_t getParentIndex(int64_t Type) { |
414 | return ((Type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1; |
415 | } |
416 | |
417 | void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind, |
418 | const char *Name) { |
419 | DP("Call to %s for device %d requesting %zu bytes\n" , Name, DeviceNum, Size); |
420 | |
421 | if (Size <= 0) { |
422 | DP("Call to %s with non-positive length\n" , Name); |
423 | return NULL; |
424 | } |
425 | |
426 | void *Rc = NULL; |
427 | |
428 | if (DeviceNum == omp_get_initial_device()) { |
429 | Rc = malloc(size: Size); |
430 | DP("%s returns host ptr " DPxMOD "\n" , Name, DPxPTR(Rc)); |
431 | return Rc; |
432 | } |
433 | |
434 | auto DeviceOrErr = PM->getDevice(DeviceNo: DeviceNum); |
435 | if (!DeviceOrErr) |
436 | FATAL_MESSAGE(DeviceNum, "%s" , toString(DeviceOrErr.takeError()).c_str()); |
437 | |
438 | Rc = DeviceOrErr->allocData(Size, HstPtr: nullptr, Kind); |
439 | DP("%s returns device ptr " DPxMOD "\n" , Name, DPxPTR(Rc)); |
440 | return Rc; |
441 | } |
442 | |
443 | void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind, |
444 | const char *Name) { |
445 | DP("Call to %s for device %d and address " DPxMOD "\n" , Name, DeviceNum, |
446 | DPxPTR(DevicePtr)); |
447 | |
448 | if (!DevicePtr) { |
449 | DP("Call to %s with NULL ptr\n" , Name); |
450 | return; |
451 | } |
452 | |
453 | if (DeviceNum == omp_get_initial_device()) { |
454 | free(ptr: DevicePtr); |
455 | DP("%s deallocated host ptr\n" , Name); |
456 | return; |
457 | } |
458 | |
459 | auto DeviceOrErr = PM->getDevice(DeviceNo: DeviceNum); |
460 | if (!DeviceOrErr) |
461 | FATAL_MESSAGE(DeviceNum, "%s" , toString(DeviceOrErr.takeError()).c_str()); |
462 | |
463 | DeviceOrErr->deleteData(TgtPtrBegin: DevicePtr, Kind); |
464 | DP("omp_target_free deallocated device ptr\n" ); |
465 | } |
466 | |
467 | void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum, |
468 | const char *Name) { |
469 | DP("Call to %s for device %d locking %zu bytes\n" , Name, DeviceNum, Size); |
470 | |
471 | if (Size <= 0) { |
472 | DP("Call to %s with non-positive length\n" , Name); |
473 | return NULL; |
474 | } |
475 | |
476 | void *RC = NULL; |
477 | |
478 | auto DeviceOrErr = PM->getDevice(DeviceNo: DeviceNum); |
479 | if (!DeviceOrErr) |
480 | FATAL_MESSAGE(DeviceNum, "%s" , toString(DeviceOrErr.takeError()).c_str()); |
481 | |
482 | int32_t Err = 0; |
483 | if (!DeviceOrErr->RTL->data_lock) { |
484 | Err = DeviceOrErr->RTL->data_lock(DeviceNum, HostPtr, Size, &RC); |
485 | if (Err) { |
486 | DP("Could not lock ptr %p\n" , HostPtr); |
487 | return nullptr; |
488 | } |
489 | } |
490 | DP("%s returns device ptr " DPxMOD "\n" , Name, DPxPTR(RC)); |
491 | return RC; |
492 | } |
493 | |
494 | void targetUnlockExplicit(void *HostPtr, int DeviceNum, const char *Name) { |
495 | DP("Call to %s for device %d unlocking\n" , Name, DeviceNum); |
496 | |
497 | auto DeviceOrErr = PM->getDevice(DeviceNo: DeviceNum); |
498 | if (!DeviceOrErr) |
499 | FATAL_MESSAGE(DeviceNum, "%s" , toString(DeviceOrErr.takeError()).c_str()); |
500 | |
501 | if (!DeviceOrErr->RTL->data_unlock) |
502 | DeviceOrErr->RTL->data_unlock(DeviceNum, HostPtr); |
503 | |
504 | DP("%s returns\n" , Name); |
505 | } |
506 | |
507 | /// Call the user-defined mapper function followed by the appropriate |
508 | // targetData* function (targetData{Begin,End,Update}). |
509 | int targetDataMapper(ident_t *Loc, DeviceTy &Device, void *ArgBase, void *Arg, |
510 | int64_t ArgSize, int64_t ArgType, map_var_info_t ArgNames, |
511 | void *ArgMapper, AsyncInfoTy &AsyncInfo, |
512 | TargetDataFuncPtrTy TargetDataFunction) { |
513 | DP("Calling the mapper function " DPxMOD "\n" , DPxPTR(ArgMapper)); |
514 | |
515 | // The mapper function fills up Components. |
516 | MapperComponentsTy MapperComponents; |
517 | MapperFuncPtrTy MapperFuncPtr = (MapperFuncPtrTy)(ArgMapper); |
518 | (*MapperFuncPtr)((void *)&MapperComponents, ArgBase, Arg, ArgSize, ArgType, |
519 | ArgNames); |
520 | |
521 | // Construct new arrays for args_base, args, arg_sizes and arg_types |
522 | // using the information in MapperComponents and call the corresponding |
523 | // targetData* function using these new arrays. |
524 | SmallVector<void *> MapperArgsBase(MapperComponents.Components.size()); |
525 | SmallVector<void *> MapperArgs(MapperComponents.Components.size()); |
526 | SmallVector<int64_t> MapperArgSizes(MapperComponents.Components.size()); |
527 | SmallVector<int64_t> MapperArgTypes(MapperComponents.Components.size()); |
528 | SmallVector<void *> MapperArgNames(MapperComponents.Components.size()); |
529 | |
530 | for (unsigned I = 0, E = MapperComponents.Components.size(); I < E; ++I) { |
531 | auto &C = MapperComponents.Components[I]; |
532 | MapperArgsBase[I] = C.Base; |
533 | MapperArgs[I] = C.Begin; |
534 | MapperArgSizes[I] = C.Size; |
535 | MapperArgTypes[I] = C.Type; |
536 | MapperArgNames[I] = C.Name; |
537 | } |
538 | |
539 | int Rc = TargetDataFunction(Loc, Device, MapperComponents.Components.size(), |
540 | MapperArgsBase.data(), MapperArgs.data(), |
541 | MapperArgSizes.data(), MapperArgTypes.data(), |
542 | MapperArgNames.data(), /*arg_mappers*/ nullptr, |
543 | AsyncInfo, /*FromMapper=*/true); |
544 | |
545 | return Rc; |
546 | } |
547 | |
548 | /// Internal function to do the mapping and transfer the data to the device |
549 | int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, |
550 | void **ArgsBase, void **Args, int64_t *ArgSizes, |
551 | int64_t *ArgTypes, map_var_info_t *ArgNames, |
552 | void **ArgMappers, AsyncInfoTy &AsyncInfo, |
553 | bool FromMapper) { |
554 | // process each input. |
555 | for (int32_t I = 0; I < ArgNum; ++I) { |
556 | // Ignore private variables and arrays - there is no mapping for them. |
557 | if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) || |
558 | (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE)) |
559 | continue; |
560 | TIMESCOPE_WITH_DETAILS_AND_IDENT( |
561 | "HostToDev" , "Size=" + std::to_string(ArgSizes[I]) + "B" , Loc); |
562 | if (ArgMappers && ArgMappers[I]) { |
563 | // Instead of executing the regular path of targetDataBegin, call the |
564 | // targetDataMapper variant which will call targetDataBegin again |
565 | // with new arguments. |
566 | DP("Calling targetDataMapper for the %dth argument\n" , I); |
567 | |
568 | map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I]; |
569 | int Rc = targetDataMapper(Loc, Device, ArgBase: ArgsBase[I], Arg: Args[I], ArgSize: ArgSizes[I], |
570 | ArgType: ArgTypes[I], ArgNames: ArgName, ArgMapper: ArgMappers[I], AsyncInfo, |
571 | TargetDataFunction: targetDataBegin); |
572 | |
573 | if (Rc != OFFLOAD_SUCCESS) { |
574 | REPORT("Call to targetDataBegin via targetDataMapper for custom mapper" |
575 | " failed.\n" ); |
576 | return OFFLOAD_FAIL; |
577 | } |
578 | |
579 | // Skip the rest of this function, continue to the next argument. |
580 | continue; |
581 | } |
582 | |
583 | void *HstPtrBegin = Args[I]; |
584 | void *HstPtrBase = ArgsBase[I]; |
585 | int64_t DataSize = ArgSizes[I]; |
586 | map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I]; |
587 | |
588 | // Adjust for proper alignment if this is a combined entry (for structs). |
589 | // Look at the next argument - if that is MEMBER_OF this one, then this one |
590 | // is a combined entry. |
591 | int64_t TgtPadding = 0; |
592 | const int NextI = I + 1; |
593 | if (getParentIndex(Type: ArgTypes[I]) < 0 && NextI < ArgNum && |
594 | getParentIndex(Type: ArgTypes[NextI]) == I) { |
595 | int64_t Alignment = getPartialStructRequiredAlignment(HstPtrBase); |
596 | TgtPadding = (int64_t)HstPtrBegin % Alignment; |
597 | if (TgtPadding) { |
598 | DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD |
599 | "\n" , |
600 | TgtPadding, DPxPTR(HstPtrBegin)); |
601 | } |
602 | } |
603 | |
604 | // Address of pointer on the host and device, respectively. |
605 | void *PointerHstPtrBegin, *PointerTgtPtrBegin; |
606 | TargetPointerResultTy PointerTpr; |
607 | bool IsHostPtr = false; |
608 | bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT; |
609 | // Force the creation of a device side copy of the data when: |
610 | // a close map modifier was associated with a map that contained a to. |
611 | bool HasCloseModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_CLOSE; |
612 | bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT; |
613 | bool HasHoldModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_OMPX_HOLD; |
614 | // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we |
615 | // have reached this point via __tgt_target_data_begin and not __tgt_target |
616 | // then no argument is marked as TARGET_PARAM ("omp target data map" is not |
617 | // associated with a target region, so there are no target parameters). This |
618 | // may be considered a hack, we could revise the scheme in the future. |
619 | bool UpdateRef = |
620 | !(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) && !(FromMapper && I == 0); |
621 | |
622 | MappingInfoTy::HDTTMapAccessorTy HDTTMap = |
623 | Device.getMappingInfo().HostDataToTargetMap.getExclusiveAccessor(); |
624 | if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) { |
625 | DP("Has a pointer entry: \n" ); |
626 | // Base is address of pointer. |
627 | // |
628 | // Usually, the pointer is already allocated by this time. For example: |
629 | // |
630 | // #pragma omp target map(s.p[0:N]) |
631 | // |
632 | // The map entry for s comes first, and the PTR_AND_OBJ entry comes |
633 | // afterward, so the pointer is already allocated by the time the |
634 | // PTR_AND_OBJ entry is handled below, and PointerTgtPtrBegin is thus |
635 | // non-null. However, "declare target link" can produce a PTR_AND_OBJ |
636 | // entry for a global that might not already be allocated by the time the |
637 | // PTR_AND_OBJ entry is handled below, and so the allocation might fail |
638 | // when HasPresentModifier. |
639 | PointerTpr = Device.getMappingInfo().getTargetPointer( |
640 | HDTTMap, HstPtrBegin: HstPtrBase, HstPtrBase, /*TgtPadding=*/0, Size: sizeof(void *), |
641 | /*HstPtrName=*/nullptr, |
642 | /*HasFlagTo=*/false, /*HasFlagAlways=*/false, IsImplicit, UpdateRefCount: UpdateRef, |
643 | HasCloseModifier, HasPresentModifier, HasHoldModifier, AsyncInfo, |
644 | /*OwnedTPR=*/nullptr, /*ReleaseHDTTMap=*/false); |
645 | PointerTgtPtrBegin = PointerTpr.TargetPointer; |
646 | IsHostPtr = PointerTpr.Flags.IsHostPointer; |
647 | if (!PointerTgtPtrBegin) { |
648 | REPORT("Call to getTargetPointer returned null pointer (%s).\n" , |
649 | HasPresentModifier ? "'present' map type modifier" |
650 | : "device failure or illegal mapping" ); |
651 | return OFFLOAD_FAIL; |
652 | } |
653 | DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new" |
654 | "\n" , |
655 | sizeof(void *), DPxPTR(PointerTgtPtrBegin), |
656 | (PointerTpr.Flags.IsNewEntry ? "" : " not" )); |
657 | PointerHstPtrBegin = HstPtrBase; |
658 | // modify current entry. |
659 | HstPtrBase = *(void **)HstPtrBase; |
660 | // No need to update pointee ref count for the first element of the |
661 | // subelement that comes from mapper. |
662 | UpdateRef = |
663 | (!FromMapper || I != 0); // subsequently update ref count of pointee |
664 | } |
665 | |
666 | const bool HasFlagTo = ArgTypes[I] & OMP_TGT_MAPTYPE_TO; |
667 | const bool HasFlagAlways = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS; |
668 | // Note that HDTTMap will be released in getTargetPointer. |
669 | auto TPR = Device.getMappingInfo().getTargetPointer( |
670 | HDTTMap, HstPtrBegin, HstPtrBase, TgtPadding, Size: DataSize, HstPtrName, |
671 | HasFlagTo, HasFlagAlways, IsImplicit, UpdateRefCount: UpdateRef, HasCloseModifier, |
672 | HasPresentModifier, HasHoldModifier, AsyncInfo, OwnedTPR: PointerTpr.getEntry()); |
673 | void *TgtPtrBegin = TPR.TargetPointer; |
674 | IsHostPtr = TPR.Flags.IsHostPointer; |
675 | // If data_size==0, then the argument could be a zero-length pointer to |
676 | // NULL, so getOrAlloc() returning NULL is not an error. |
677 | if (!TgtPtrBegin && (DataSize || HasPresentModifier)) { |
678 | REPORT("Call to getTargetPointer returned null pointer (%s).\n" , |
679 | HasPresentModifier ? "'present' map type modifier" |
680 | : "device failure or illegal mapping" ); |
681 | return OFFLOAD_FAIL; |
682 | } |
683 | DP("There are %" PRId64 " bytes allocated at target address " DPxMOD |
684 | " - is%s new\n" , |
685 | DataSize, DPxPTR(TgtPtrBegin), (TPR.Flags.IsNewEntry ? "" : " not" )); |
686 | |
687 | if (ArgTypes[I] & OMP_TGT_MAPTYPE_RETURN_PARAM) { |
688 | uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase; |
689 | void *TgtPtrBase = (void *)((uintptr_t)TgtPtrBegin - Delta); |
690 | DP("Returning device pointer " DPxMOD "\n" , DPxPTR(TgtPtrBase)); |
691 | ArgsBase[I] = TgtPtrBase; |
692 | } |
693 | |
694 | if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) { |
695 | |
696 | uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase; |
697 | void *ExpectedTgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta); |
698 | |
699 | if (PointerTpr.getEntry()->addShadowPointer(ShadowPtrInfo: ShadowPtrInfoTy{ |
700 | .HstPtrAddr: (void **)PointerHstPtrBegin, .HstPtrVal: HstPtrBase, |
701 | .TgtPtrAddr: (void **)PointerTgtPtrBegin, .TgtPtrVal: ExpectedTgtPtrBase})) { |
702 | DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n" , |
703 | DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin)); |
704 | |
705 | void *&TgtPtrBase = AsyncInfo.getVoidPtrLocation(); |
706 | TgtPtrBase = ExpectedTgtPtrBase; |
707 | |
708 | int Ret = |
709 | Device.submitData(TgtPtrBegin: PointerTgtPtrBegin, HstPtrBegin: &TgtPtrBase, Size: sizeof(void *), |
710 | AsyncInfo, Entry: PointerTpr.getEntry()); |
711 | if (Ret != OFFLOAD_SUCCESS) { |
712 | REPORT("Copying data to device failed.\n" ); |
713 | return OFFLOAD_FAIL; |
714 | } |
715 | if (PointerTpr.getEntry()->addEventIfNecessary(Device, AsyncInfo) != |
716 | OFFLOAD_SUCCESS) |
717 | return OFFLOAD_FAIL; |
718 | } |
719 | } |
720 | |
721 | // Check if variable can be used on the device: |
722 | bool IsStructMember = ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF; |
723 | if (getInfoLevel() & OMP_INFOTYPE_EMPTY_MAPPING && ArgTypes[I] != 0 && |
724 | !IsStructMember && !IsImplicit && !TPR.isPresent() && |
725 | !TPR.isContained() && !TPR.isHostPointer()) |
726 | INFO(OMP_INFOTYPE_EMPTY_MAPPING, Device.DeviceID, |
727 | "variable %s does not have a valid device counterpart\n" , |
728 | (HstPtrName) ? getNameFromMapping(HstPtrName).c_str() : "unknown" ); |
729 | } |
730 | |
731 | return OFFLOAD_SUCCESS; |
732 | } |
733 | |
namespace {
/// This structure contains information to deallocate a target pointer, aka.
/// used to fix up the shadow map and potentially delete the entry from the
/// mapping table via \p DeviceTy::deallocTgtPtr.
struct PostProcessingInfo {
  /// Host pointer used to look up into the map table
  void *HstPtrBegin;

  /// Size of the data
  int64_t DataSize;

  /// The mapping type (bitfield).
  int64_t ArgType;

  /// The target pointer information.
  TargetPointerResultTy TPR;

  /// Record the lookup key \p HstPtr, transfer size, and map-type bits, and
  /// take ownership of \p TPR by moving it into this entry.
  PostProcessingInfo(void *HstPtr, int64_t Size, int64_t ArgType,
                     TargetPointerResultTy &&TPR)
      : HstPtrBegin(HstPtr), DataSize(Size), ArgType(ArgType),
        TPR(std::move(TPR)) {}
};

} // namespace
758 | |
/// Applies the necessary post-processing procedures to entries listed in \p
/// EntriesInfo after the execution of all device side operations from a target
/// data end. This includes the update of pointers at the host and removal of
/// device buffer when needed. It returns OFFLOAD_FAIL or OFFLOAD_SUCCESS
/// according to the successfulness of the operations.
///
/// Note: \p EntriesInfo is heap-allocated by targetDataEnd; this function
/// takes ownership and deletes it before returning.
[[nodiscard]] static int
postProcessingTargetDataEnd(DeviceTy *Device,
                            SmallVector<PostProcessingInfo> &EntriesInfo) {
  int Ret = OFFLOAD_SUCCESS;

  for (auto &[HstPtrBegin, DataSize, ArgType, TPR] : EntriesInfo) {
    // Entries backed by unified/shared (host) memory have no device buffer
    // to deallocate.
    bool DelEntry = !TPR.isHostPointer();

    // If the last element from the mapper (for end transfer args comes in
    // reverse order), do not remove the partial entry, the parent struct still
    // exists.
    if ((ArgType & OMP_TGT_MAPTYPE_MEMBER_OF) &&
        !(ArgType & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
      DelEntry = false; // protect parent struct from being deallocated
    }

    // If we marked the entry to be deleted we need to verify no other
    // thread reused it by now. If deletion is still supposed to happen by
    // this thread LR will be set and exclusive access to the HDTT map
    // will avoid another thread reusing the entry now. Note that we do
    // not request (exclusive) access to the HDTT map if DelEntry is
    // not set.
    MappingInfoTy::HDTTMapAccessorTy HDTTMap =
        Device->getMappingInfo().HostDataToTargetMap.getExclusiveAccessor();

    // We cannot use a lock guard because we may end up delete the mutex.
    // We also explicitly unlocked the entry after it was put in the EntriesInfo
    // so it can be reused.
    TPR.getEntry()->lock();
    auto *Entry = TPR.getEntry();

    // Only delete if this thread is the last data-end user of the entry and
    // no other thread re-acquired a reference in the meantime.
    const bool IsNotLastUser = Entry->decDataEndThreadCount() != 0;
    if (DelEntry && (Entry->getTotalRefCount() != 0 || IsNotLastUser)) {
      // The thread is not in charge of deletion anymore. Give up access
      // to the HDTT map and unset the deletion flag.
      HDTTMap.destroy();
      DelEntry = false;
    }

    // If we copied back to the host a struct/array containing pointers,
    // we need to restore the original host pointer values from their
    // shadow copies. If the struct is going to be deallocated, remove any
    // remaining shadow pointer entries for this struct.
    const bool HasFrom = ArgType & OMP_TGT_MAPTYPE_FROM;
    if (HasFrom) {
      Entry->foreachShadowPointerInfo(CB: [&](const ShadowPtrInfoTy &ShadowPtr) {
        *ShadowPtr.HstPtrAddr = ShadowPtr.HstPtrVal;
        DP("Restoring original host pointer value " DPxMOD " for host "
           "pointer " DPxMOD "\n" ,
           DPxPTR(ShadowPtr.HstPtrVal), DPxPTR(ShadowPtr.HstPtrAddr));
        return OFFLOAD_SUCCESS;
      });
    }

    // Give up the lock as we either don't need it anymore (e.g., done with
    // TPR), or erase TPR.
    TPR.setEntry(HDTTT: nullptr);

    if (!DelEntry)
      continue;

    Ret = Device->getMappingInfo().eraseMapEntry(HDTTMap, Entry, Size: DataSize);
    // Entry is already removed from the map, we can unlock it now.
    HDTTMap.destroy();
    Ret |= Device->getMappingInfo().deallocTgtPtrAndEntry(Entry, Size: DataSize);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Deallocating data from device failed.\n" );
      break;
    }
  }

  // Release the vector handed over by targetDataEnd now that all entries
  // have been processed.
  delete &EntriesInfo;
  return Ret;
}
838 | |
/// Internal function to undo the mapping and retrieve the data from the device.
int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                  void **ArgBases, void **Args, int64_t *ArgSizes,
                  int64_t *ArgTypes, map_var_info_t *ArgNames,
                  void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) {
  int Ret = OFFLOAD_SUCCESS;
  // Heap-allocated on purpose: ownership is handed to the post-processing
  // function registered at the end, which deletes the vector when it runs.
  auto *PostProcessingPtrs = new SmallVector<PostProcessingInfo>();
  // process each input (in reverse order).
  for (int32_t I = ArgNum - 1; I >= 0; --I) {
    // Ignore private variables and arrays - there is no mapping for them.
    // Also, ignore the use_device_ptr directive, it has no effect here.
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
        (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (ArgMappers && ArgMappers[I]) {
      // Instead of executing the regular path of targetDataEnd, call the
      // targetDataMapper variant which will call targetDataEnd again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n" , I);

      map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
      Ret = targetDataMapper(Loc, Device, ArgBase: ArgBases[I], Arg: Args[I], ArgSize: ArgSizes[I],
                             ArgType: ArgTypes[I], ArgNames: ArgName, ArgMapper: ArgMappers[I], AsyncInfo,
                             TargetDataFunction: targetDataEnd);

      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataEnd via targetDataMapper for custom mapper"
               " failed.\n" );
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    void *HstPtrBegin = Args[I];
    int64_t DataSize = ArgSizes[I];
    bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT;
    // Update the reference count unless this is a non-PTR_AND_OBJ member of
    // a struct, or it is the first argument of a custom-mapper invocation.
    bool UpdateRef = (!(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) ||
                      (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) &&
                     !(FromMapper && I == 0);
    bool ForceDelete = ArgTypes[I] & OMP_TGT_MAPTYPE_DELETE;
    bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT;
    bool HasHoldModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_OMPX_HOLD;

    // If PTR_AND_OBJ, HstPtrBegin is address of pointee
    TargetPointerResultTy TPR = Device.getMappingInfo().getTgtPtrBegin(
        HstPtrBegin, Size: DataSize, UpdateRefCount: UpdateRef, UseHoldRefCount: HasHoldModifier, MustContain: !IsImplicit,
        ForceDelete, /*FromDataEnd=*/true);
    void *TgtPtrBegin = TPR.TargetPointer;
    if (!TPR.isPresent() && !TPR.isHostPointer() &&
        (DataSize || HasPresentModifier)) {
      DP("Mapping does not exist (%s)\n" ,
         (HasPresentModifier ? "'present' map type modifier" : "ignored" ));
      if (HasPresentModifier) {
        // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 350 L10-13:
        // "If a map clause appears on a target, target data, target enter data
        // or target exit data construct with a present map-type-modifier then
        // on entry to the region if the corresponding list item does not appear
        // in the device data environment then an error occurs and the program
        // terminates."
        //
        // This should be an error upon entering an "omp target exit data". It
        // should not be an error upon exiting an "omp target data" or "omp
        // target". For "omp target data", Clang thus doesn't include present
        // modifiers for end calls. For "omp target", we have not found a valid
        // OpenMP program for which the error matters: it appears that, if a
        // program can guarantee that data is present at the beginning of an
        // "omp target" region so that there's no error there, that data is also
        // guaranteed to be present at the end.
        MESSAGE("device mapping required by 'present' map type modifier does "
                "not exist for host address " DPxMOD " (%" PRId64 " bytes)" ,
                DPxPTR(HstPtrBegin), DataSize);
        return OFFLOAD_FAIL;
      }
    } else {
      DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
         " - is%s last\n" ,
         DataSize, DPxPTR(TgtPtrBegin), (TPR.Flags.IsLast ? "" : " not" ));
    }

    // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 351 L14-16:
    // "If the map clause appears on a target, target data, or target exit data
    // construct and a corresponding list item of the original list item is not
    // present in the device data environment on exit from the region then the
    // list item is ignored."
    if (!TPR.isPresent())
      continue;

    // Move data back to the host
    const bool HasAlways = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS;
    const bool HasFrom = ArgTypes[I] & OMP_TGT_MAPTYPE_FROM;
    if (HasFrom && (HasAlways || TPR.Flags.IsLast) &&
        !TPR.Flags.IsHostPointer && DataSize != 0) {
      DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n" ,
         DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
      TIMESCOPE_WITH_DETAILS_AND_IDENT(
          "DevToHost" , "Size=" + std::to_string(DataSize) + "B" , Loc);
      // Wait for any previous transfer if an event is present.
      if (void *Event = TPR.getEntry()->getEvent()) {
        if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) {
          REPORT("Failed to wait for event " DPxMOD ".\n" , DPxPTR(Event));
          return OFFLOAD_FAIL;
        }
      }

      Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, Size: DataSize, AsyncInfo,
                                Entry: TPR.getEntry());
      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Copying data from device failed.\n" );
        return OFFLOAD_FAIL;
      }

      // As we are expecting to delete the entry the d2h copy might race
      // with another one that also tries to delete the entry. This happens
      // as the entry can be reused and the reuse might happen after the
      // copy-back was issued but before it completed. Since the reuse might
      // also copy-back a value we would race.
      if (TPR.Flags.IsLast) {
        if (TPR.getEntry()->addEventIfNecessary(Device, AsyncInfo) !=
            OFFLOAD_SUCCESS)
          return OFFLOAD_FAIL;
      }
    }

    // Add pointer to the buffer for post-synchronize processing.
    PostProcessingPtrs->emplace_back(Args&: HstPtrBegin, Args&: DataSize, Args&: ArgTypes[I],
                                     Args: std::move(TPR));
    // Unlock the entry so other threads can reuse it; post-processing
    // re-locks it before touching the entry again.
    PostProcessingPtrs->back().TPR.getEntry()->unlock();
  }

  // Add post-processing functions
  // TODO: We might want to remove `mutable` in the future by not changing the
  // captured variables somehow.
  AsyncInfo.addPostProcessingFunction(Function: [=, Device = &Device]() mutable -> int {
    return postProcessingTargetDataEnd(Device, EntriesInfo&: *PostProcessingPtrs);
  });

  return Ret;
}
980 | |
/// Transfer one contiguous chunk of data to and/or from the device according
/// to the TO/FROM bits in \p ArgType, keeping shadow pointers consistent on
/// both sides. Used by targetDataUpdate (directly, and indirectly via
/// targetDataNonContiguous). Returns OFFLOAD_SUCCESS or OFFLOAD_FAIL.
static int targetDataContiguous(ident_t *Loc, DeviceTy &Device, void *ArgsBase,
                                void *HstPtrBegin, int64_t ArgSize,
                                int64_t ArgType, AsyncInfoTy &AsyncInfo) {
  // Look up the existing mapping without touching its reference count.
  TargetPointerResultTy TPR = Device.getMappingInfo().getTgtPtrBegin(
      HstPtrBegin, Size: ArgSize, /*UpdateRefCount=*/false,
      /*UseHoldRefCount=*/false, /*MustContain=*/true);
  void *TgtPtrBegin = TPR.TargetPointer;
  if (!TPR.isPresent()) {
    DP("hst data:" DPxMOD " not found, becomes a noop\n" , DPxPTR(HstPtrBegin));
    if (ArgType & OMP_TGT_MAPTYPE_PRESENT) {
      MESSAGE("device mapping required by 'present' motion modifier does not "
              "exist for host address " DPxMOD " (%" PRId64 " bytes)" ,
              DPxPTR(HstPtrBegin), ArgSize);
      return OFFLOAD_FAIL;
    }
    return OFFLOAD_SUCCESS;
  }

  if (TPR.Flags.IsHostPointer) {
    DP("hst data:" DPxMOD " unified and shared, becomes a noop\n" ,
       DPxPTR(HstPtrBegin));
    return OFFLOAD_SUCCESS;
  }

  if (ArgType & OMP_TGT_MAPTYPE_TO) {
    DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n" ,
       ArgSize, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
    int Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, Size: ArgSize, AsyncInfo,
                                Entry: TPR.getEntry());
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Copying data to device failed.\n" );
      return OFFLOAD_FAIL;
    }
    // The host-to-device copy may have overwritten device-side pointer
    // fields with host pointer values; re-submit the recorded target
    // pointer values from the shadow map.
    if (TPR.getEntry()) {
      int Ret = TPR.getEntry()->foreachShadowPointerInfo(
          CB: [&](ShadowPtrInfoTy &ShadowPtr) {
            DP("Restoring original target pointer value " DPxMOD " for target "
               "pointer " DPxMOD "\n" ,
               DPxPTR(ShadowPtr.TgtPtrVal), DPxPTR(ShadowPtr.TgtPtrAddr));
            Ret = Device.submitData(TgtPtrBegin: ShadowPtr.TgtPtrAddr,
                                    HstPtrBegin: (void *)&ShadowPtr.TgtPtrVal,
                                    Size: sizeof(void *), AsyncInfo);
            if (Ret != OFFLOAD_SUCCESS) {
              REPORT("Copying data to device failed.\n" );
              return OFFLOAD_FAIL;
            }
            return OFFLOAD_SUCCESS;
          });
      if (Ret != OFFLOAD_SUCCESS) {
        DP("Updating shadow map failed\n" );
        return Ret;
      }
    }
  }

  if (ArgType & OMP_TGT_MAPTYPE_FROM) {
    DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n" ,
       ArgSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
    int Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, Size: ArgSize, AsyncInfo,
                                  Entry: TPR.getEntry());
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Copying data from device failed.\n" );
      return OFFLOAD_FAIL;
    }

    // Wait for device-to-host memcopies for whole struct to complete,
    // before restoring the correct host pointer.
    if (auto *Entry = TPR.getEntry()) {
      AsyncInfo.addPostProcessingFunction(Function: [=]() -> int {
        int Ret = Entry->foreachShadowPointerInfo(
            CB: [&](const ShadowPtrInfoTy &ShadowPtr) {
              *ShadowPtr.HstPtrAddr = ShadowPtr.HstPtrVal;
              DP("Restoring original host pointer value " DPxMOD
                 " for host pointer " DPxMOD "\n" ,
                 DPxPTR(ShadowPtr.HstPtrVal), DPxPTR(ShadowPtr.HstPtrAddr));
              return OFFLOAD_SUCCESS;
            });
        Entry->unlock();
        if (Ret != OFFLOAD_SUCCESS) {
          DP("Updating shadow map failed\n" );
          return Ret;
        }
        return OFFLOAD_SUCCESS;
      });
    }
  }

  return OFFLOAD_SUCCESS;
}
1070 | |
1071 | static int targetDataNonContiguous(ident_t *Loc, DeviceTy &Device, |
1072 | void *ArgsBase, |
1073 | __tgt_target_non_contig *NonContig, |
1074 | uint64_t Size, int64_t ArgType, |
1075 | int CurrentDim, int DimSize, uint64_t Offset, |
1076 | AsyncInfoTy &AsyncInfo) { |
1077 | int Ret = OFFLOAD_SUCCESS; |
1078 | if (CurrentDim < DimSize) { |
1079 | for (unsigned int I = 0; I < NonContig[CurrentDim].Count; ++I) { |
1080 | uint64_t CurOffset = |
1081 | (NonContig[CurrentDim].Offset + I) * NonContig[CurrentDim].Stride; |
1082 | // we only need to transfer the first element for the last dimension |
1083 | // since we've already got a contiguous piece. |
1084 | if (CurrentDim != DimSize - 1 || I == 0) { |
1085 | Ret = targetDataNonContiguous(Loc, Device, ArgsBase, NonContig, Size, |
1086 | ArgType, CurrentDim: CurrentDim + 1, DimSize, |
1087 | Offset: Offset + CurOffset, AsyncInfo); |
1088 | // Stop the whole process if any contiguous piece returns anything |
1089 | // other than OFFLOAD_SUCCESS. |
1090 | if (Ret != OFFLOAD_SUCCESS) |
1091 | return Ret; |
1092 | } |
1093 | } |
1094 | } else { |
1095 | char *Ptr = (char *)ArgsBase + Offset; |
1096 | DP("Transfer of non-contiguous : host ptr " DPxMOD " offset %" PRIu64 |
1097 | " len %" PRIu64 "\n" , |
1098 | DPxPTR(Ptr), Offset, Size); |
1099 | Ret = targetDataContiguous(Loc, Device, ArgsBase, HstPtrBegin: Ptr, ArgSize: Size, ArgType, |
1100 | AsyncInfo); |
1101 | } |
1102 | return Ret; |
1103 | } |
1104 | |
1105 | static int getNonContigMergedDimension(__tgt_target_non_contig *NonContig, |
1106 | int32_t DimSize) { |
1107 | int RemovedDim = 0; |
1108 | for (int I = DimSize - 1; I > 0; --I) { |
1109 | if (NonContig[I].Count * NonContig[I].Stride == NonContig[I - 1].Stride) |
1110 | RemovedDim++; |
1111 | } |
1112 | return RemovedDim; |
1113 | } |
1114 | |
/// Internal function to pass data to/from the target.
int targetDataUpdate(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                     void **ArgsBase, void **Args, int64_t *ArgSizes,
                     int64_t *ArgTypes, map_var_info_t *ArgNames,
                     void **ArgMappers, AsyncInfoTy &AsyncInfo, bool) {
  // process each input.
  for (int32_t I = 0; I < ArgNum; ++I) {
    // Literal (by-value) and private arguments have no device mapping to
    // update.
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
        (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (ArgMappers && ArgMappers[I]) {
      // Instead of executing the regular path of targetDataUpdate, call the
      // targetDataMapper variant which will call targetDataUpdate again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n" , I);

      map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
      int Ret = targetDataMapper(Loc, Device, ArgBase: ArgsBase[I], Arg: Args[I], ArgSize: ArgSizes[I],
                                 ArgType: ArgTypes[I], ArgNames: ArgName, ArgMapper: ArgMappers[I], AsyncInfo,
                                 TargetDataFunction: targetDataUpdate);

      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataUpdate via targetDataMapper for custom mapper"
               " failed.\n" );
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    int Ret = OFFLOAD_SUCCESS;

    if (ArgTypes[I] & OMP_TGT_MAPTYPE_NON_CONTIG) {
      // For non-contiguous updates, Args[I] points to an array of dimension
      // descriptors and ArgSizes[I] holds the number of dimensions.
      __tgt_target_non_contig *NonContig = (__tgt_target_non_contig *)Args[I];
      int32_t DimSize = ArgSizes[I];
      // Size of one innermost (already contiguous) piece.
      uint64_t Size =
          NonContig[DimSize - 1].Count * NonContig[DimSize - 1].Stride;
      // Adjacent dimensions that turn out to be contiguous are merged away.
      int32_t MergedDim = getNonContigMergedDimension(NonContig, DimSize);
      Ret = targetDataNonContiguous(
          Loc, Device, ArgsBase: ArgsBase[I], NonContig, Size, ArgType: ArgTypes[I],
          /*current_dim=*/CurrentDim: 0, DimSize: DimSize - MergedDim, /*offset=*/Offset: 0, AsyncInfo);
    } else {
      Ret = targetDataContiguous(Loc, Device, ArgsBase: ArgsBase[I], HstPtrBegin: Args[I], ArgSize: ArgSizes[I],
                                 ArgType: ArgTypes[I], AsyncInfo);
    }
    if (Ret == OFFLOAD_FAIL)
      return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}
1167 | |
1168 | static const unsigned LambdaMapping = OMP_TGT_MAPTYPE_PTR_AND_OBJ | |
1169 | OMP_TGT_MAPTYPE_LITERAL | |
1170 | OMP_TGT_MAPTYPE_IMPLICIT; |
1171 | static bool isLambdaMapping(int64_t Mapping) { |
1172 | return (Mapping & LambdaMapping) == LambdaMapping; |
1173 | } |
1174 | |
1175 | namespace { |
1176 | /// Find the table information in the map or look it up in the translation |
1177 | /// tables. |
1178 | TableMap *getTableMap(void *HostPtr) { |
1179 | std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx); |
1180 | HostPtrToTableMapTy::iterator TableMapIt = |
1181 | PM->HostPtrToTableMap.find(x: HostPtr); |
1182 | |
1183 | if (TableMapIt != PM->HostPtrToTableMap.end()) |
1184 | return &TableMapIt->second; |
1185 | |
1186 | // We don't have a map. So search all the registered libraries. |
1187 | TableMap *TM = nullptr; |
1188 | std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx); |
1189 | for (HostEntriesBeginToTransTableTy::iterator Itr = |
1190 | PM->HostEntriesBeginToTransTable.begin(); |
1191 | Itr != PM->HostEntriesBeginToTransTable.end(); ++Itr) { |
1192 | // get the translation table (which contains all the good info). |
1193 | TranslationTable *TransTable = &Itr->second; |
1194 | // iterate over all the host table entries to see if we can locate the |
1195 | // host_ptr. |
1196 | __tgt_offload_entry *Cur = TransTable->HostTable.EntriesBegin; |
1197 | for (uint32_t I = 0; Cur < TransTable->HostTable.EntriesEnd; ++Cur, ++I) { |
1198 | if (Cur->addr != HostPtr) |
1199 | continue; |
1200 | // we got a match, now fill the HostPtrToTableMap so that we |
1201 | // may avoid this search next time. |
1202 | TM = &(PM->HostPtrToTableMap)[HostPtr]; |
1203 | TM->Table = TransTable; |
1204 | TM->Index = I; |
1205 | return TM; |
1206 | } |
1207 | } |
1208 | |
1209 | return nullptr; |
1210 | } |
1211 | |
1212 | /// A class manages private arguments in a target region. |
1213 | class PrivateArgumentManagerTy { |
1214 | /// A data structure for the information of first-private arguments. We can |
1215 | /// use this information to optimize data transfer by packing all |
1216 | /// first-private arguments and transfer them all at once. |
1217 | struct FirstPrivateArgInfoTy { |
1218 | /// Host pointer begin |
1219 | char *HstPtrBegin; |
1220 | /// Host pointer end |
1221 | char *HstPtrEnd; |
1222 | /// The index of the element in \p TgtArgs corresponding to the argument |
1223 | int Index; |
1224 | /// Alignment of the entry (base of the entry, not after the entry). |
1225 | uint32_t Alignment; |
1226 | /// Size (without alignment, see padding) |
1227 | uint32_t Size; |
1228 | /// Padding used to align this argument entry, if necessary. |
1229 | uint32_t Padding; |
1230 | /// Host pointer name |
1231 | map_var_info_t HstPtrName = nullptr; |
1232 | |
1233 | FirstPrivateArgInfoTy(int Index, void *HstPtr, uint32_t Size, |
1234 | uint32_t Alignment, uint32_t Padding, |
1235 | map_var_info_t HstPtrName = nullptr) |
1236 | : HstPtrBegin(reinterpret_cast<char *>(HstPtr)), |
1237 | HstPtrEnd(HstPtrBegin + Size), Index(Index), Alignment(Alignment), |
1238 | Size(Size), Padding(Padding), HstPtrName(HstPtrName) {} |
1239 | }; |
1240 | |
1241 | /// A vector of target pointers for all private arguments |
1242 | SmallVector<void *> TgtPtrs; |
1243 | |
1244 | /// A vector of information of all first-private arguments to be packed |
1245 | SmallVector<FirstPrivateArgInfoTy> FirstPrivateArgInfo; |
1246 | /// Host buffer for all arguments to be packed |
1247 | SmallVector<char> FirstPrivateArgBuffer; |
1248 | /// The total size of all arguments to be packed |
1249 | int64_t FirstPrivateArgSize = 0; |
1250 | |
1251 | /// A reference to the \p DeviceTy object |
1252 | DeviceTy &Device; |
1253 | /// A pointer to a \p AsyncInfoTy object |
1254 | AsyncInfoTy &AsyncInfo; |
1255 | |
1256 | // TODO: What would be the best value here? Should we make it configurable? |
1257 | // If the size is larger than this threshold, we will allocate and transfer it |
1258 | // immediately instead of packing it. |
1259 | static constexpr const int64_t FirstPrivateArgSizeThreshold = 1024; |
1260 | |
1261 | public: |
1262 | /// Constructor |
1263 | PrivateArgumentManagerTy(DeviceTy &Dev, AsyncInfoTy &AsyncInfo) |
1264 | : Device(Dev), AsyncInfo(AsyncInfo) {} |
1265 | |
1266 | /// Add a private argument |
1267 | int addArg(void *HstPtr, int64_t ArgSize, int64_t ArgOffset, |
1268 | bool IsFirstPrivate, void *&TgtPtr, int TgtArgsIndex, |
1269 | map_var_info_t HstPtrName = nullptr, |
1270 | const bool AllocImmediately = false) { |
1271 | // If the argument is not first-private, or its size is greater than a |
1272 | // predefined threshold, we will allocate memory and issue the transfer |
1273 | // immediately. |
1274 | if (ArgSize > FirstPrivateArgSizeThreshold || !IsFirstPrivate || |
1275 | AllocImmediately) { |
1276 | TgtPtr = Device.allocData(Size: ArgSize, HstPtr); |
1277 | if (!TgtPtr) { |
1278 | DP("Data allocation for %sprivate array " DPxMOD " failed.\n" , |
1279 | (IsFirstPrivate ? "first-" : "" ), DPxPTR(HstPtr)); |
1280 | return OFFLOAD_FAIL; |
1281 | } |
1282 | #ifdef OMPTARGET_DEBUG |
1283 | void *TgtPtrBase = (void *)((intptr_t)TgtPtr + ArgOffset); |
1284 | DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD |
1285 | " for %sprivate array " DPxMOD " - pushing target argument " DPxMOD |
1286 | "\n" , |
1287 | ArgSize, DPxPTR(TgtPtr), (IsFirstPrivate ? "first-" : "" ), |
1288 | DPxPTR(HstPtr), DPxPTR(TgtPtrBase)); |
1289 | #endif |
1290 | // If first-private, copy data from host |
1291 | if (IsFirstPrivate) { |
1292 | DP("Submitting firstprivate data to the device.\n" ); |
1293 | int Ret = Device.submitData(TgtPtrBegin: TgtPtr, HstPtrBegin: HstPtr, Size: ArgSize, AsyncInfo); |
1294 | if (Ret != OFFLOAD_SUCCESS) { |
1295 | DP("Copying data to device failed, failed.\n" ); |
1296 | return OFFLOAD_FAIL; |
1297 | } |
1298 | } |
1299 | TgtPtrs.push_back(Elt: TgtPtr); |
1300 | } else { |
1301 | DP("Firstprivate array " DPxMOD " of size %" PRId64 " will be packed\n" , |
1302 | DPxPTR(HstPtr), ArgSize); |
1303 | // When reach this point, the argument must meet all following |
1304 | // requirements: |
1305 | // 1. Its size does not exceed the threshold (see the comment for |
1306 | // FirstPrivateArgSizeThreshold); |
1307 | // 2. It must be first-private (needs to be mapped to target device). |
1308 | // We will pack all this kind of arguments to transfer them all at once |
1309 | // to reduce the number of data transfer. We will not take |
1310 | // non-first-private arguments, aka. private arguments that doesn't need |
1311 | // to be mapped to target device, into account because data allocation |
1312 | // can be very efficient with memory manager. |
1313 | |
1314 | // Placeholder value |
1315 | TgtPtr = nullptr; |
1316 | auto *LastFPArgInfo = |
1317 | FirstPrivateArgInfo.empty() ? nullptr : &FirstPrivateArgInfo.back(); |
1318 | |
1319 | // Compute the start alignment of this entry, add padding if necessary. |
1320 | // TODO: Consider sorting instead. |
1321 | uint32_t Padding = 0; |
1322 | uint32_t StartAlignment = |
1323 | LastFPArgInfo ? LastFPArgInfo->Alignment : MaxAlignment; |
1324 | if (LastFPArgInfo) { |
1325 | // Check if we keep the start alignment or if it is shrunk due to the |
1326 | // size of the last element. |
1327 | uint32_t Offset = LastFPArgInfo->Size % StartAlignment; |
1328 | if (Offset) |
1329 | StartAlignment = Offset; |
1330 | // We only need as much alignment as the host pointer had (since we |
1331 | // don't know the alignment information from the source we might end up |
1332 | // overaligning accesses but not too much). |
1333 | uint32_t RequiredAlignment = |
1334 | llvm::bit_floor(Value: getPartialStructRequiredAlignment(HstPtrBase: HstPtr)); |
1335 | if (RequiredAlignment > StartAlignment) { |
1336 | Padding = RequiredAlignment - StartAlignment; |
1337 | StartAlignment = RequiredAlignment; |
1338 | } |
1339 | } |
1340 | |
1341 | FirstPrivateArgInfo.emplace_back(Args&: TgtArgsIndex, Args&: HstPtr, Args&: ArgSize, |
1342 | Args&: StartAlignment, Args&: Padding, Args&: HstPtrName); |
1343 | FirstPrivateArgSize += Padding + ArgSize; |
1344 | } |
1345 | |
1346 | return OFFLOAD_SUCCESS; |
1347 | } |
1348 | |
1349 | /// Pack first-private arguments, replace place holder pointers in \p TgtArgs, |
1350 | /// and start the transfer. |
1351 | int packAndTransfer(SmallVector<void *> &TgtArgs) { |
1352 | if (!FirstPrivateArgInfo.empty()) { |
1353 | assert(FirstPrivateArgSize != 0 && |
1354 | "FirstPrivateArgSize is 0 but FirstPrivateArgInfo is empty" ); |
1355 | FirstPrivateArgBuffer.resize(N: FirstPrivateArgSize, NV: 0); |
1356 | auto *Itr = FirstPrivateArgBuffer.begin(); |
1357 | // Copy all host data to this buffer |
1358 | for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) { |
1359 | // First pad the pointer as we (have to) pad it on the device too. |
1360 | Itr = std::next(x: Itr, n: Info.Padding); |
1361 | std::copy(first: Info.HstPtrBegin, last: Info.HstPtrEnd, result: Itr); |
1362 | Itr = std::next(x: Itr, n: Info.Size); |
1363 | } |
1364 | // Allocate target memory |
1365 | void *TgtPtr = |
1366 | Device.allocData(Size: FirstPrivateArgSize, HstPtr: FirstPrivateArgBuffer.data()); |
1367 | if (TgtPtr == nullptr) { |
1368 | DP("Failed to allocate target memory for private arguments.\n" ); |
1369 | return OFFLOAD_FAIL; |
1370 | } |
1371 | TgtPtrs.push_back(Elt: TgtPtr); |
1372 | DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD "\n" , |
1373 | FirstPrivateArgSize, DPxPTR(TgtPtr)); |
1374 | // Transfer data to target device |
1375 | int Ret = Device.submitData(TgtPtrBegin: TgtPtr, HstPtrBegin: FirstPrivateArgBuffer.data(), |
1376 | Size: FirstPrivateArgSize, AsyncInfo); |
1377 | if (Ret != OFFLOAD_SUCCESS) { |
1378 | DP("Failed to submit data of private arguments.\n" ); |
1379 | return OFFLOAD_FAIL; |
1380 | } |
1381 | // Fill in all placeholder pointers |
1382 | auto TP = reinterpret_cast<uintptr_t>(TgtPtr); |
1383 | for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) { |
1384 | void *&Ptr = TgtArgs[Info.Index]; |
1385 | assert(Ptr == nullptr && "Target pointer is already set by mistaken" ); |
1386 | // Pad the device pointer to get the right alignment. |
1387 | TP += Info.Padding; |
1388 | Ptr = reinterpret_cast<void *>(TP); |
1389 | TP += Info.Size; |
1390 | DP("Firstprivate array " DPxMOD " of size %" PRId64 " mapped to " DPxMOD |
1391 | "\n" , |
1392 | DPxPTR(Info.HstPtrBegin), Info.HstPtrEnd - Info.HstPtrBegin, |
1393 | DPxPTR(Ptr)); |
1394 | } |
1395 | } |
1396 | |
1397 | return OFFLOAD_SUCCESS; |
1398 | } |
1399 | |
1400 | /// Free all target memory allocated for private arguments |
1401 | int free() { |
1402 | for (void *P : TgtPtrs) { |
1403 | int Ret = Device.deleteData(TgtPtrBegin: P); |
1404 | if (Ret != OFFLOAD_SUCCESS) { |
1405 | DP("Deallocation of (first-)private arrays failed.\n" ); |
1406 | return OFFLOAD_FAIL; |
1407 | } |
1408 | } |
1409 | |
1410 | TgtPtrs.clear(); |
1411 | |
1412 | return OFFLOAD_SUCCESS; |
1413 | } |
1414 | }; |
1415 | |
/// Process data before launching the kernel, including calling targetDataBegin
/// to map and transfer data to target device, transferring (first-)private
/// variables.
///
/// On success, \p TgtArgs and \p TgtOffsets hold one entry per argument marked
/// OMP_TGT_MAPTYPE_TARGET_PARAM (in order); they are later passed to the
/// plugin as the kernel argument vector. \p PrivateArgumentManager records the
/// (first-)private device allocations so they can be freed after the kernel
/// has finished. Returns OFFLOAD_SUCCESS or OFFLOAD_FAIL.
static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr,
                             int32_t ArgNum, void **ArgBases, void **Args,
                             int64_t *ArgSizes, int64_t *ArgTypes,
                             map_var_info_t *ArgNames, void **ArgMappers,
                             SmallVector<void *> &TgtArgs,
                             SmallVector<ptrdiff_t> &TgtOffsets,
                             PrivateArgumentManagerTy &PrivateArgumentManager,
                             AsyncInfoTy &AsyncInfo) {

  auto DeviceOrErr = PM->getDevice(DeviceNo: DeviceId);
  if (!DeviceOrErr)
    FATAL_MESSAGE(DeviceId, "%s" , toString(DeviceOrErr.takeError()).c_str());

  // Map and transfer all explicitly mapped arguments first; this establishes
  // the host->device translations consulted below.
  int Ret = targetDataBegin(Loc, Device&: *DeviceOrErr, ArgNum, ArgsBase: ArgBases, Args, ArgSizes,
                            ArgTypes, ArgNames, ArgMappers, AsyncInfo);
  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Call to targetDataBegin failed, abort target.\n" );
    return OFFLOAD_FAIL;
  }

  // Maps each argument index to its position in TgtArgs (-1 if the argument
  // was not pushed), so that MEMBER_OF lambda captures can locate the target
  // address of their parent argument.
  // List of (first-)private arrays allocated for this target region
  SmallVector<int> TgtArgsPositions(ArgNum, -1);

  for (int32_t I = 0; I < ArgNum; ++I) {
    if (!(ArgTypes[I] & OMP_TGT_MAPTYPE_TARGET_PARAM)) {
      // This is not a target parameter, do not push it into TgtArgs.
      // Check for lambda mapping.
      if (isLambdaMapping(Mapping: ArgTypes[I])) {
        assert((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
               "PTR_AND_OBJ must be also MEMBER_OF." );
        unsigned Idx = getParentIndex(Type: ArgTypes[I]);
        int TgtIdx = TgtArgsPositions[Idx];
        assert(TgtIdx != -1 && "Base address must be translated already." );
        // The parent lambda must be processed already and it must be the last
        // in TgtArgs and TgtOffsets arrays.
        void *HstPtrVal = Args[I];
        void *HstPtrBegin = ArgBases[I];
        void *HstPtrBase = Args[Idx];
        void *TgtPtrBase =
            (void *)((intptr_t)TgtArgs[TgtIdx] + TgtOffsets[TgtIdx]);
        DP("Parent lambda base " DPxMOD "\n" , DPxPTR(TgtPtrBase));
        // The captured field sits at the same byte offset inside the device
        // copy of the lambda as it does inside the host copy.
        uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
        void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta);
        void *&PointerTgtPtrBegin = AsyncInfo.getVoidPtrLocation();
        // Look up the device copy of the pointee without touching reference
        // counts; we only need its address to patch the lambda field.
        TargetPointerResultTy TPR =
            DeviceOrErr->getMappingInfo().getTgtPtrBegin(
                HstPtrBegin: HstPtrVal, Size: ArgSizes[I], /*UpdateRefCount=*/false,
                /*UseHoldRefCount=*/false);
        PointerTgtPtrBegin = TPR.TargetPointer;
        if (!TPR.isPresent()) {
          // Captured variable was never mapped; leave the field untouched.
          DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n" ,
             DPxPTR(HstPtrVal));
          continue;
        }
        if (TPR.Flags.IsHostPointer) {
          // Device can dereference the host pointer directly; no patch needed.
          DP("Unified memory is active, no need to map lambda captured"
             "variable (" DPxMOD ")\n" ,
             DPxPTR(HstPtrVal));
          continue;
        }
        DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n" ,
           DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
        // Write the translated device pointer into the device-side lambda.
        Ret =
            DeviceOrErr->submitData(TgtPtrBegin, HstPtrBegin: &PointerTgtPtrBegin,
                                    Size: sizeof(void *), AsyncInfo, Entry: TPR.getEntry());
        if (Ret != OFFLOAD_SUCCESS) {
          REPORT("Copying data to device failed.\n" );
          return OFFLOAD_FAIL;
        }
      }
      continue;
    }
    void *HstPtrBegin = Args[I];
    void *HstPtrBase = ArgBases[I];
    void *TgtPtrBegin;
    map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I];
    ptrdiff_t TgtBaseOffset;
    TargetPointerResultTy TPR;
    if (ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) {
      // Scalar firstprivate: the value itself is the kernel argument, no
      // device memory is involved.
      DP("Forwarding first-private value " DPxMOD " to the target construct\n" ,
         DPxPTR(HstPtrBase));
      TgtPtrBegin = HstPtrBase;
      TgtBaseOffset = 0;
    } else if (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE) {
      TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
      // PRIVATE combined with TO means firstprivate: the initial host value
      // must be transferred into the private device buffer.
      const bool IsFirstPrivate = (ArgTypes[I] & OMP_TGT_MAPTYPE_TO);
      // If there is a next argument and it depends on the current one, we need
      // to allocate the private memory immediately. If this is not the case,
      // then the argument can be marked for optimization and packed with the
      // other privates.
      const bool AllocImmediately =
          (I < ArgNum - 1 && (ArgTypes[I + 1] & OMP_TGT_MAPTYPE_MEMBER_OF));
      Ret = PrivateArgumentManager.addArg(
          HstPtr: HstPtrBegin, ArgSize: ArgSizes[I], ArgOffset: TgtBaseOffset, IsFirstPrivate, TgtPtr&: TgtPtrBegin,
          TgtArgsIndex: TgtArgs.size(), HstPtrName, AllocImmediately);
      if (Ret != OFFLOAD_SUCCESS) {
        REPORT("Failed to process %sprivate argument " DPxMOD "\n" ,
               (IsFirstPrivate ? "first-" : "" ), DPxPTR(HstPtrBegin));
        return OFFLOAD_FAIL;
      }
    } else {
      // Regular mapped argument: translate the host pointer to the device
      // counterpart established by targetDataBegin (no ref-count change).
      if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)
        HstPtrBase = *reinterpret_cast<void **>(HstPtrBase);
      TPR = DeviceOrErr->getMappingInfo().getTgtPtrBegin(
          HstPtrBegin, Size: ArgSizes[I],
          /*UpdateRefCount=*/false,
          /*UseHoldRefCount=*/false);
      TgtPtrBegin = TPR.TargetPointer;
      TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
#ifdef OMPTARGET_DEBUG
      void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset);
      DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n" ,
         DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin));
#endif
    }
    // Record where this argument landed so later MEMBER_OF entries can refer
    // back to it, then push the translated pointer and its base offset.
    TgtArgsPositions[I] = TgtArgs.size();
    TgtArgs.push_back(Elt: TgtPtrBegin);
    TgtOffsets.push_back(Elt: TgtBaseOffset);
  }

  assert(TgtArgs.size() == TgtOffsets.size() &&
         "Size mismatch in arguments and offsets" );

  // Pack and transfer first-private arguments
  Ret = PrivateArgumentManager.packAndTransfer(TgtArgs);
  if (Ret != OFFLOAD_SUCCESS) {
    DP("Failed to pack and transfer first private arguments\n" );
    return OFFLOAD_FAIL;
  }

  return OFFLOAD_SUCCESS;
}
1551 | |
1552 | /// Process data after launching the kernel, including transferring data back to |
1553 | /// host if needed and deallocating target memory of (first-)private variables. |
1554 | static int processDataAfter(ident_t *Loc, int64_t DeviceId, void *HostPtr, |
1555 | int32_t ArgNum, void **ArgBases, void **Args, |
1556 | int64_t *ArgSizes, int64_t *ArgTypes, |
1557 | map_var_info_t *ArgNames, void **ArgMappers, |
1558 | PrivateArgumentManagerTy &PrivateArgumentManager, |
1559 | AsyncInfoTy &AsyncInfo) { |
1560 | |
1561 | auto DeviceOrErr = PM->getDevice(DeviceNo: DeviceId); |
1562 | if (!DeviceOrErr) |
1563 | FATAL_MESSAGE(DeviceId, "%s" , toString(DeviceOrErr.takeError()).c_str()); |
1564 | |
1565 | // Move data from device. |
1566 | int Ret = targetDataEnd(Loc, Device&: *DeviceOrErr, ArgNum, ArgBases, Args, ArgSizes, |
1567 | ArgTypes, ArgNames, ArgMappers, AsyncInfo); |
1568 | if (Ret != OFFLOAD_SUCCESS) { |
1569 | REPORT("Call to targetDataEnd failed, abort target.\n" ); |
1570 | return OFFLOAD_FAIL; |
1571 | } |
1572 | |
1573 | // Free target memory for private arguments after synchronization. |
1574 | // TODO: We might want to remove `mutable` in the future by not changing the |
1575 | // captured variables somehow. |
1576 | AsyncInfo.addPostProcessingFunction( |
1577 | Function: [PrivateArgumentManager = |
1578 | std::move(PrivateArgumentManager)]() mutable -> int { |
1579 | int Ret = PrivateArgumentManager.free(); |
1580 | if (Ret != OFFLOAD_SUCCESS) { |
1581 | REPORT("Failed to deallocate target memory for private args\n" ); |
1582 | return OFFLOAD_FAIL; |
1583 | } |
1584 | return Ret; |
1585 | }); |
1586 | |
1587 | return OFFLOAD_SUCCESS; |
1588 | } |
1589 | } // namespace |
1590 | |
/// performs the same actions as data_begin in case arg_num is
/// non-zero and initiates run of the offloaded region on the target platform;
/// if arg_num is non-zero after the region execution is done it also
/// performs the same action as data_update and data_end above. This function
/// returns 0 if it was able to transfer the execution to a target and an
/// integer different from zero otherwise.
int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
           KernelArgsTy &KernelArgs, AsyncInfoTy &AsyncInfo) {
  int32_t DeviceId = Device.DeviceID;
  // Translate the host entry pointer into its table entry; without it we
  // cannot locate the device kernel.
  TableMap *TM = getTableMap(HostPtr);
  // No map for this host pointer found!
  if (!TM) {
    REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n" ,
           DPxPTR(HostPtr));
    return OFFLOAD_FAIL;
  }

  // get target table.
  __tgt_target_table *TargetTable = nullptr;
  {
    // The translation tables are shared across threads; guard the lookup.
    std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
    assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
           "Not expecting a device ID outside the table's bounds!" );
    TargetTable = TM->Table->TargetsTable[DeviceId];
  }
  assert(TargetTable && "Global data has not been mapped\n" );

  DP("loop trip count is %" PRIu64 ".\n" , KernelArgs.Tripcount);

  // We need to keep bases and offsets separate. Sometimes (e.g. in OpenCL) we
  // need to manifest base pointers prior to launching a kernel. Even if we have
  // mapped an object only partially, e.g. A[N:M], although the kernel is
  // expected to access elements starting at address &A[N] and beyond, we still
  // need to manifest the base of the array &A[0]. In other cases, e.g. the COI
  // API, we need the begin address itself, i.e. &A[N], as the API operates on
  // begin addresses, not bases. That's why we pass args and offsets as two
  // separate entities so that each plugin can do what it needs. This behavior
  // was introdued via https://reviews.llvm.org/D33028 and commit 1546d319244c.
  SmallVector<void *> TgtArgs;
  SmallVector<ptrdiff_t> TgtOffsets;

  PrivateArgumentManagerTy PrivateArgumentManager(Device, AsyncInfo);

  int NumClangLaunchArgs = KernelArgs.NumArgs;
  int Ret = OFFLOAD_SUCCESS;
  if (NumClangLaunchArgs) {
    // Process data, such as data mapping, before launching the kernel
    Ret = processDataBefore(Loc, DeviceId, HostPtr, ArgNum: NumClangLaunchArgs,
                            ArgBases: KernelArgs.ArgBasePtrs, Args: KernelArgs.ArgPtrs,
                            ArgSizes: KernelArgs.ArgSizes, ArgTypes: KernelArgs.ArgTypes,
                            ArgNames: KernelArgs.ArgNames, ArgMappers: KernelArgs.ArgMappers, TgtArgs,
                            TgtOffsets, PrivateArgumentManager, AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Failed to process data before launching the kernel.\n" );
      return OFFLOAD_FAIL;
    }

    // Clang might pass more values via the ArgPtrs to the runtime that we pass
    // on to the kernel.
    // TOOD: Next time we adjust the KernelArgsTy we should introduce a new
    // NumKernelArgs field.
    KernelArgs.NumArgs = TgtArgs.size();
  }

  // Launch device execution.
  void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr;
  DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n" ,
     TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index);

  {
    assert(KernelArgs.NumArgs == TgtArgs.size() && "Argument count mismatch!" );
    // Scoped profiling of the launch itself (excludes the pre/post data
    // processing above and below).
    TIMESCOPE_WITH_DETAILS_AND_IDENT(
        "Kernel Target" ,
        "NumArguments=" + std::to_string(KernelArgs.NumArgs) +
            ";NumTeams=" + std::to_string(KernelArgs.NumTeams[0]) +
            ";TripCount=" + std::to_string(KernelArgs.Tripcount),
        Loc);

#ifdef OMPT_SUPPORT
    assert(KernelArgs.NumTeams[1] == 0 && KernelArgs.NumTeams[2] == 0 &&
           "Multi dimensional launch not supported yet." );
    /// RAII to establish tool anchors before and after kernel launch
    int32_t NumTeams = KernelArgs.NumTeams[0];
    // No need to guard this with OMPT_IF_BUILT
    InterfaceRAII TargetSubmitRAII(
        RegionInterface.getCallbacks<ompt_callback_target_submit>(), NumTeams);
#endif

    Ret = Device.launchKernel(TgtEntryPtr, TgtVarsPtr: TgtArgs.data(), TgtOffsets: TgtOffsets.data(),
                              KernelArgs, AsyncInfo);
  }

  if (Ret != OFFLOAD_SUCCESS) {
    REPORT("Executing target region abort target.\n" );
    return OFFLOAD_FAIL;
  }

  if (NumClangLaunchArgs) {
    // Transfer data back and deallocate target memory for (first-)private
    // variables
    Ret = processDataAfter(Loc, DeviceId, HostPtr, ArgNum: NumClangLaunchArgs,
                           ArgBases: KernelArgs.ArgBasePtrs, Args: KernelArgs.ArgPtrs,
                           ArgSizes: KernelArgs.ArgSizes, ArgTypes: KernelArgs.ArgTypes,
                           ArgNames: KernelArgs.ArgNames, ArgMappers: KernelArgs.ArgMappers,
                           PrivateArgumentManager, AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Failed to process data after launching the kernel.\n" );
      return OFFLOAD_FAIL;
    }
  }

  return OFFLOAD_SUCCESS;
}
1704 | |
1705 | /// Enables the record replay mechanism by pre-allocating MemorySize |
1706 | /// and informing the record-replayer of whether to store the output |
1707 | /// in some file. |
1708 | int target_activate_rr(DeviceTy &Device, uint64_t MemorySize, void *VAddr, |
1709 | bool IsRecord, bool SaveOutput, |
1710 | uint64_t &ReqPtrArgOffset) { |
1711 | return Device.RTL->initialize_record_replay(Device.DeviceID, MemorySize, |
1712 | VAddr, IsRecord, SaveOutput, |
1713 | ReqPtrArgOffset); |
1714 | } |
1715 | |
1716 | /// Executes a kernel using pre-recorded information for loading to |
1717 | /// device memory to launch the target kernel with the pre-recorded |
1718 | /// configuration. |
1719 | int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr, |
1720 | void *DeviceMemory, int64_t DeviceMemorySize, void **TgtArgs, |
1721 | ptrdiff_t *TgtOffsets, int32_t NumArgs, int32_t NumTeams, |
1722 | int32_t ThreadLimit, uint64_t LoopTripCount, |
1723 | AsyncInfoTy &AsyncInfo) { |
1724 | int32_t DeviceId = Device.DeviceID; |
1725 | TableMap *TM = getTableMap(HostPtr); |
1726 | // Fail if the table map fails to find the target kernel pointer for the |
1727 | // provided host pointer. |
1728 | if (!TM) { |
1729 | REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n" , |
1730 | DPxPTR(HostPtr)); |
1731 | return OFFLOAD_FAIL; |
1732 | } |
1733 | |
1734 | // Retrieve the target table of offloading entries. |
1735 | __tgt_target_table *TargetTable = nullptr; |
1736 | { |
1737 | std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx); |
1738 | assert(TM->Table->TargetsTable.size() > (size_t)DeviceId && |
1739 | "Not expecting a device ID outside the table's bounds!" ); |
1740 | TargetTable = TM->Table->TargetsTable[DeviceId]; |
1741 | } |
1742 | assert(TargetTable && "Global data has not been mapped\n" ); |
1743 | |
1744 | // Retrieve the target kernel pointer, allocate and store the recorded device |
1745 | // memory data, and launch device execution. |
1746 | void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr; |
1747 | DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n" , |
1748 | TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index); |
1749 | |
1750 | void *TgtPtr = Device.allocData(Size: DeviceMemorySize, /*HstPtr=*/nullptr, |
1751 | Kind: TARGET_ALLOC_DEFAULT); |
1752 | Device.submitData(TgtPtrBegin: TgtPtr, HstPtrBegin: DeviceMemory, Size: DeviceMemorySize, AsyncInfo); |
1753 | |
1754 | KernelArgsTy KernelArgs = {.Version: 0}; |
1755 | KernelArgs.Version = 2; |
1756 | KernelArgs.NumArgs = NumArgs; |
1757 | KernelArgs.Tripcount = LoopTripCount; |
1758 | KernelArgs.NumTeams[0] = NumTeams; |
1759 | KernelArgs.ThreadLimit[0] = ThreadLimit; |
1760 | |
1761 | int Ret = Device.launchKernel(TgtEntryPtr, TgtVarsPtr: TgtArgs, TgtOffsets, KernelArgs, |
1762 | AsyncInfo); |
1763 | |
1764 | if (Ret != OFFLOAD_SUCCESS) { |
1765 | REPORT("Executing target region abort target.\n" ); |
1766 | return OFFLOAD_FAIL; |
1767 | } |
1768 | |
1769 | return OFFLOAD_SUCCESS; |
1770 | } |
1771 | |