1 | // SPDX-License-Identifier: GPL-2.0 OR MIT |
2 | /* |
3 | * Copyright 2020-2021 Advanced Micro Devices, Inc. |
4 | * |
5 | * Permission is hereby granted, free of charge, to any person obtaining a |
6 | * copy of this software and associated documentation files (the "Software"), |
7 | * to deal in the Software without restriction, including without limitation |
8 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
9 | * and/or sell copies of the Software, and to permit persons to whom the |
10 | * Software is furnished to do so, subject to the following conditions: |
11 | * |
12 | * The above copyright notice and this permission notice shall be included in |
13 | * all copies or substantial portions of the Software. |
14 | * |
15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
18 | * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR |
19 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, |
20 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR |
21 | * OTHER DEALINGS IN THE SOFTWARE. |
22 | */ |
23 | |
24 | #include <linux/types.h> |
25 | #include <linux/sched/task.h> |
26 | #include <linux/dynamic_debug.h> |
27 | #include <drm/ttm/ttm_tt.h> |
28 | #include <drm/drm_exec.h> |
29 | |
30 | #include "amdgpu_sync.h" |
31 | #include "amdgpu_object.h" |
32 | #include "amdgpu_vm.h" |
33 | #include "amdgpu_hmm.h" |
34 | #include "amdgpu.h" |
35 | #include "amdgpu_xgmi.h" |
36 | #include "kfd_priv.h" |
37 | #include "kfd_svm.h" |
38 | #include "kfd_migrate.h" |
39 | #include "kfd_smi_events.h" |
40 | |
41 | #ifdef dev_fmt |
42 | #undef dev_fmt |
43 | #endif |
44 | #define dev_fmt(fmt) "kfd_svm: %s: " fmt, __func__ |
45 | |
46 | #define AMDGPU_SVM_RANGE_RESTORE_DELAY_MS 1 |
47 | |
48 | /* Long enough to ensure no retry fault comes after svm range is restored and |
49 | * page table is updated. |
50 | */ |
51 | #define AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING (2UL * NSEC_PER_MSEC) |
52 | #if IS_ENABLED(CONFIG_DYNAMIC_DEBUG) |
53 | #define dynamic_svm_range_dump(svms) \ |
54 | _dynamic_func_call_no_desc("svm_range_dump", svm_range_debug_dump, svms) |
55 | #else |
56 | #define dynamic_svm_range_dump(svms) \ |
57 | do { if (0) svm_range_debug_dump(svms); } while (0) |
58 | #endif |
59 | |
60 | /* Giant svm range split into smaller ranges based on this, it is decided using |
61 | * minimum of all dGPU/APU 1/32 VRAM size, between 2MB to 1GB and alignment to |
62 | * power of 2MB. |
63 | */ |
64 | static uint64_t max_svm_range_pages; |
65 | |
66 | struct criu_svm_metadata { |
67 | struct list_head list; |
68 | struct kfd_criu_svm_range_priv_data data; |
69 | }; |
70 | |
71 | static void svm_range_evict_svm_bo_worker(struct work_struct *work); |
72 | static bool |
73 | svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni, |
74 | const struct mmu_notifier_range *range, |
75 | unsigned long cur_seq); |
76 | static int |
77 | svm_range_check_vm(struct kfd_process *p, uint64_t start, uint64_t last, |
78 | uint64_t *bo_s, uint64_t *bo_l); |
79 | static const struct mmu_interval_notifier_ops svm_range_mn_ops = { |
80 | .invalidate = svm_range_cpu_invalidate_pagetables, |
81 | }; |
82 | |
83 | /** |
84 | * svm_range_unlink - unlink svm_range from lists and interval tree |
85 | * @prange: svm range structure to be removed |
86 | * |
87 | * Remove the svm_range from the svms and svm_bo lists and the svms |
88 | * interval tree. |
89 | * |
90 | * Context: The caller must hold svms->lock |
91 | */ |
92 | static void svm_range_unlink(struct svm_range *prange) |
93 | { |
94 | pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n" , prange->svms, |
95 | prange, prange->start, prange->last); |
96 | |
97 | if (prange->svm_bo) { |
98 | spin_lock(lock: &prange->svm_bo->list_lock); |
99 | list_del(entry: &prange->svm_bo_list); |
100 | spin_unlock(lock: &prange->svm_bo->list_lock); |
101 | } |
102 | |
103 | list_del(entry: &prange->list); |
104 | if (prange->it_node.start != 0 && prange->it_node.last != 0) |
105 | interval_tree_remove(node: &prange->it_node, root: &prange->svms->objects); |
106 | } |
107 | |
108 | static void |
109 | svm_range_add_notifier_locked(struct mm_struct *mm, struct svm_range *prange) |
110 | { |
111 | pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n" , prange->svms, |
112 | prange, prange->start, prange->last); |
113 | |
114 | mmu_interval_notifier_insert_locked(interval_sub: &prange->notifier, mm, |
115 | start: prange->start << PAGE_SHIFT, |
116 | length: prange->npages << PAGE_SHIFT, |
117 | ops: &svm_range_mn_ops); |
118 | } |
119 | |
120 | /** |
121 | * svm_range_add_to_svms - add svm range to svms |
122 | * @prange: svm range structure to be added |
123 | * |
124 | * Add the svm range to svms interval tree and link list |
125 | * |
126 | * Context: The caller must hold svms->lock |
127 | */ |
128 | static void svm_range_add_to_svms(struct svm_range *prange) |
129 | { |
130 | pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n" , prange->svms, |
131 | prange, prange->start, prange->last); |
132 | |
133 | list_move_tail(list: &prange->list, head: &prange->svms->list); |
134 | prange->it_node.start = prange->start; |
135 | prange->it_node.last = prange->last; |
136 | interval_tree_insert(node: &prange->it_node, root: &prange->svms->objects); |
137 | } |
138 | |
139 | static void svm_range_remove_notifier(struct svm_range *prange) |
140 | { |
141 | pr_debug("remove notifier svms 0x%p prange 0x%p [0x%lx 0x%lx]\n" , |
142 | prange->svms, prange, |
143 | prange->notifier.interval_tree.start >> PAGE_SHIFT, |
144 | prange->notifier.interval_tree.last >> PAGE_SHIFT); |
145 | |
146 | if (prange->notifier.interval_tree.start != 0 && |
147 | prange->notifier.interval_tree.last != 0) |
148 | mmu_interval_notifier_remove(interval_sub: &prange->notifier); |
149 | } |
150 | |
151 | static bool |
152 | svm_is_valid_dma_mapping_addr(struct device *dev, dma_addr_t dma_addr) |
153 | { |
154 | return dma_addr && !dma_mapping_error(dev, dma_addr) && |
155 | !(dma_addr & SVM_RANGE_VRAM_DOMAIN); |
156 | } |
157 | |
158 | static int |
159 | svm_range_dma_map_dev(struct amdgpu_device *adev, struct svm_range *prange, |
160 | unsigned long offset, unsigned long npages, |
161 | unsigned long *hmm_pfns, uint32_t gpuidx) |
162 | { |
163 | enum dma_data_direction dir = DMA_BIDIRECTIONAL; |
164 | dma_addr_t *addr = prange->dma_addr[gpuidx]; |
165 | struct device *dev = adev->dev; |
166 | struct page *page; |
167 | int i, r; |
168 | |
169 | if (!addr) { |
170 | addr = kvcalloc(n: prange->npages, size: sizeof(*addr), GFP_KERNEL); |
171 | if (!addr) |
172 | return -ENOMEM; |
173 | prange->dma_addr[gpuidx] = addr; |
174 | } |
175 | |
176 | addr += offset; |
177 | for (i = 0; i < npages; i++) { |
178 | if (svm_is_valid_dma_mapping_addr(dev, dma_addr: addr[i])) |
179 | dma_unmap_page(dev, addr[i], PAGE_SIZE, dir); |
180 | |
181 | page = hmm_pfn_to_page(hmm_pfn: hmm_pfns[i]); |
182 | if (is_zone_device_page(page)) { |
183 | struct amdgpu_device *bo_adev = prange->svm_bo->node->adev; |
184 | |
185 | addr[i] = (hmm_pfns[i] << PAGE_SHIFT) + |
186 | bo_adev->vm_manager.vram_base_offset - |
187 | bo_adev->kfd.pgmap.range.start; |
188 | addr[i] |= SVM_RANGE_VRAM_DOMAIN; |
189 | pr_debug_ratelimited("vram address: 0x%llx\n" , addr[i]); |
190 | continue; |
191 | } |
192 | addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE, dir); |
193 | r = dma_mapping_error(dev, dma_addr: addr[i]); |
194 | if (r) { |
195 | dev_err(dev, "failed %d dma_map_page\n" , r); |
196 | return r; |
197 | } |
198 | pr_debug_ratelimited("dma mapping 0x%llx for page addr 0x%lx\n" , |
199 | addr[i] >> PAGE_SHIFT, page_to_pfn(page)); |
200 | } |
201 | |
202 | return 0; |
203 | } |
204 | |
205 | static int |
206 | svm_range_dma_map(struct svm_range *prange, unsigned long *bitmap, |
207 | unsigned long offset, unsigned long npages, |
208 | unsigned long *hmm_pfns) |
209 | { |
210 | struct kfd_process *p; |
211 | uint32_t gpuidx; |
212 | int r; |
213 | |
214 | p = container_of(prange->svms, struct kfd_process, svms); |
215 | |
216 | for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) { |
217 | struct kfd_process_device *pdd; |
218 | |
219 | pr_debug("mapping to gpu idx 0x%x\n" , gpuidx); |
220 | pdd = kfd_process_device_from_gpuidx(p, gpuidx); |
221 | if (!pdd) { |
222 | pr_debug("failed to find device idx %d\n" , gpuidx); |
223 | return -EINVAL; |
224 | } |
225 | |
226 | r = svm_range_dma_map_dev(adev: pdd->dev->adev, prange, offset, npages, |
227 | hmm_pfns, gpuidx); |
228 | if (r) |
229 | break; |
230 | } |
231 | |
232 | return r; |
233 | } |
234 | |
235 | void svm_range_dma_unmap_dev(struct device *dev, dma_addr_t *dma_addr, |
236 | unsigned long offset, unsigned long npages) |
237 | { |
238 | enum dma_data_direction dir = DMA_BIDIRECTIONAL; |
239 | int i; |
240 | |
241 | if (!dma_addr) |
242 | return; |
243 | |
244 | for (i = offset; i < offset + npages; i++) { |
245 | if (!svm_is_valid_dma_mapping_addr(dev, dma_addr: dma_addr[i])) |
246 | continue; |
247 | pr_debug_ratelimited("unmap 0x%llx\n" , dma_addr[i] >> PAGE_SHIFT); |
248 | dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir); |
249 | dma_addr[i] = 0; |
250 | } |
251 | } |
252 | |
253 | void svm_range_dma_unmap(struct svm_range *prange) |
254 | { |
255 | struct kfd_process_device *pdd; |
256 | dma_addr_t *dma_addr; |
257 | struct device *dev; |
258 | struct kfd_process *p; |
259 | uint32_t gpuidx; |
260 | |
261 | p = container_of(prange->svms, struct kfd_process, svms); |
262 | |
263 | for (gpuidx = 0; gpuidx < MAX_GPU_INSTANCE; gpuidx++) { |
264 | dma_addr = prange->dma_addr[gpuidx]; |
265 | if (!dma_addr) |
266 | continue; |
267 | |
268 | pdd = kfd_process_device_from_gpuidx(p, gpuidx); |
269 | if (!pdd) { |
270 | pr_debug("failed to find device idx %d\n" , gpuidx); |
271 | continue; |
272 | } |
273 | dev = &pdd->dev->adev->pdev->dev; |
274 | |
275 | svm_range_dma_unmap_dev(dev, dma_addr, offset: 0, npages: prange->npages); |
276 | } |
277 | } |
278 | |
279 | static void svm_range_free(struct svm_range *prange, bool do_unmap) |
280 | { |
281 | uint64_t size = (prange->last - prange->start + 1) << PAGE_SHIFT; |
282 | struct kfd_process *p = container_of(prange->svms, struct kfd_process, svms); |
283 | uint32_t gpuidx; |
284 | |
285 | pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx]\n" , prange->svms, prange, |
286 | prange->start, prange->last); |
287 | |
288 | svm_range_vram_node_free(prange); |
289 | if (do_unmap) |
290 | svm_range_dma_unmap(prange); |
291 | |
292 | if (do_unmap && !p->xnack_enabled) { |
293 | pr_debug("unreserve prange 0x%p size: 0x%llx\n" , prange, size); |
294 | amdgpu_amdkfd_unreserve_mem_limit(NULL, size, |
295 | KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, xcp_id: 0); |
296 | } |
297 | |
298 | /* free dma_addr array for each gpu */ |
299 | for (gpuidx = 0; gpuidx < MAX_GPU_INSTANCE; gpuidx++) { |
300 | if (prange->dma_addr[gpuidx]) { |
301 | kvfree(addr: prange->dma_addr[gpuidx]); |
302 | prange->dma_addr[gpuidx] = NULL; |
303 | } |
304 | } |
305 | |
306 | mutex_destroy(lock: &prange->lock); |
307 | mutex_destroy(lock: &prange->migrate_mutex); |
308 | kfree(objp: prange); |
309 | } |
310 | |
311 | static void |
312 | svm_range_set_default_attributes(int32_t *location, int32_t *prefetch_loc, |
313 | uint8_t *granularity, uint32_t *flags) |
314 | { |
315 | *location = KFD_IOCTL_SVM_LOCATION_UNDEFINED; |
316 | *prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED; |
317 | *granularity = 9; |
318 | *flags = |
319 | KFD_IOCTL_SVM_FLAG_HOST_ACCESS | KFD_IOCTL_SVM_FLAG_COHERENT; |
320 | } |
321 | |
322 | static struct |
323 | svm_range *svm_range_new(struct svm_range_list *svms, uint64_t start, |
324 | uint64_t last, bool update_mem_usage) |
325 | { |
326 | uint64_t size = last - start + 1; |
327 | struct svm_range *prange; |
328 | struct kfd_process *p; |
329 | |
330 | prange = kzalloc(size: sizeof(*prange), GFP_KERNEL); |
331 | if (!prange) |
332 | return NULL; |
333 | |
334 | p = container_of(svms, struct kfd_process, svms); |
335 | if (!p->xnack_enabled && update_mem_usage && |
336 | amdgpu_amdkfd_reserve_mem_limit(NULL, size: size << PAGE_SHIFT, |
337 | KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, xcp_id: 0)) { |
338 | pr_info("SVM mapping failed, exceeds resident system memory limit\n" ); |
339 | kfree(objp: prange); |
340 | return NULL; |
341 | } |
342 | prange->npages = size; |
343 | prange->svms = svms; |
344 | prange->start = start; |
345 | prange->last = last; |
346 | INIT_LIST_HEAD(list: &prange->list); |
347 | INIT_LIST_HEAD(list: &prange->update_list); |
348 | INIT_LIST_HEAD(list: &prange->svm_bo_list); |
349 | INIT_LIST_HEAD(list: &prange->deferred_list); |
350 | INIT_LIST_HEAD(list: &prange->child_list); |
351 | atomic_set(v: &prange->invalid, i: 0); |
352 | prange->validate_timestamp = 0; |
353 | prange->vram_pages = 0; |
354 | mutex_init(&prange->migrate_mutex); |
355 | mutex_init(&prange->lock); |
356 | |
357 | if (p->xnack_enabled) |
358 | bitmap_copy(dst: prange->bitmap_access, src: svms->bitmap_supported, |
359 | MAX_GPU_INSTANCE); |
360 | |
361 | svm_range_set_default_attributes(location: &prange->preferred_loc, |
362 | prefetch_loc: &prange->prefetch_loc, |
363 | granularity: &prange->granularity, flags: &prange->flags); |
364 | |
365 | pr_debug("svms 0x%p [0x%llx 0x%llx]\n" , svms, start, last); |
366 | |
367 | return prange; |
368 | } |
369 | |
370 | static bool svm_bo_ref_unless_zero(struct svm_range_bo *svm_bo) |
371 | { |
372 | if (!svm_bo || !kref_get_unless_zero(kref: &svm_bo->kref)) |
373 | return false; |
374 | |
375 | return true; |
376 | } |
377 | |
378 | static void svm_range_bo_release(struct kref *kref) |
379 | { |
380 | struct svm_range_bo *svm_bo; |
381 | |
382 | svm_bo = container_of(kref, struct svm_range_bo, kref); |
383 | pr_debug("svm_bo 0x%p\n" , svm_bo); |
384 | |
385 | spin_lock(lock: &svm_bo->list_lock); |
386 | while (!list_empty(head: &svm_bo->range_list)) { |
387 | struct svm_range *prange = |
388 | list_first_entry(&svm_bo->range_list, |
389 | struct svm_range, svm_bo_list); |
390 | /* list_del_init tells a concurrent svm_range_vram_node_new when |
391 | * it's safe to reuse the svm_bo pointer and svm_bo_list head. |
392 | */ |
393 | list_del_init(entry: &prange->svm_bo_list); |
394 | spin_unlock(lock: &svm_bo->list_lock); |
395 | |
396 | pr_debug("svms 0x%p [0x%lx 0x%lx]\n" , prange->svms, |
397 | prange->start, prange->last); |
398 | mutex_lock(&prange->lock); |
399 | prange->svm_bo = NULL; |
400 | /* prange should not hold vram page now */ |
401 | WARN_ONCE(prange->actual_loc, "prange should not hold vram page" ); |
402 | mutex_unlock(lock: &prange->lock); |
403 | |
404 | spin_lock(lock: &svm_bo->list_lock); |
405 | } |
406 | spin_unlock(lock: &svm_bo->list_lock); |
407 | if (!dma_fence_is_signaled(fence: &svm_bo->eviction_fence->base)) |
408 | /* We're not in the eviction worker. Signal the fence. */ |
409 | dma_fence_signal(fence: &svm_bo->eviction_fence->base); |
410 | dma_fence_put(fence: &svm_bo->eviction_fence->base); |
411 | amdgpu_bo_unref(bo: &svm_bo->bo); |
412 | kfree(objp: svm_bo); |
413 | } |
414 | |
415 | static void svm_range_bo_wq_release(struct work_struct *work) |
416 | { |
417 | struct svm_range_bo *svm_bo; |
418 | |
419 | svm_bo = container_of(work, struct svm_range_bo, release_work); |
420 | svm_range_bo_release(kref: &svm_bo->kref); |
421 | } |
422 | |
423 | static void svm_range_bo_release_async(struct kref *kref) |
424 | { |
425 | struct svm_range_bo *svm_bo; |
426 | |
427 | svm_bo = container_of(kref, struct svm_range_bo, kref); |
428 | pr_debug("svm_bo 0x%p\n" , svm_bo); |
429 | INIT_WORK(&svm_bo->release_work, svm_range_bo_wq_release); |
430 | schedule_work(work: &svm_bo->release_work); |
431 | } |
432 | |
433 | void svm_range_bo_unref_async(struct svm_range_bo *svm_bo) |
434 | { |
435 | kref_put(kref: &svm_bo->kref, release: svm_range_bo_release_async); |
436 | } |
437 | |
438 | static void svm_range_bo_unref(struct svm_range_bo *svm_bo) |
439 | { |
440 | if (svm_bo) |
441 | kref_put(kref: &svm_bo->kref, release: svm_range_bo_release); |
442 | } |
443 | |
444 | static bool |
445 | svm_range_validate_svm_bo(struct kfd_node *node, struct svm_range *prange) |
446 | { |
447 | mutex_lock(&prange->lock); |
448 | if (!prange->svm_bo) { |
449 | mutex_unlock(lock: &prange->lock); |
450 | return false; |
451 | } |
452 | if (prange->ttm_res) { |
453 | /* We still have a reference, all is well */ |
454 | mutex_unlock(lock: &prange->lock); |
455 | return true; |
456 | } |
457 | if (svm_bo_ref_unless_zero(svm_bo: prange->svm_bo)) { |
458 | /* |
459 | * Migrate from GPU to GPU, remove range from source svm_bo->node |
460 | * range list, and return false to allocate svm_bo from destination |
461 | * node. |
462 | */ |
463 | if (prange->svm_bo->node != node) { |
464 | mutex_unlock(lock: &prange->lock); |
465 | |
466 | spin_lock(lock: &prange->svm_bo->list_lock); |
467 | list_del_init(entry: &prange->svm_bo_list); |
468 | spin_unlock(lock: &prange->svm_bo->list_lock); |
469 | |
470 | svm_range_bo_unref(svm_bo: prange->svm_bo); |
471 | return false; |
472 | } |
473 | if (READ_ONCE(prange->svm_bo->evicting)) { |
474 | struct dma_fence *f; |
475 | struct svm_range_bo *svm_bo; |
476 | /* The BO is getting evicted, |
477 | * we need to get a new one |
478 | */ |
479 | mutex_unlock(lock: &prange->lock); |
480 | svm_bo = prange->svm_bo; |
481 | f = dma_fence_get(fence: &svm_bo->eviction_fence->base); |
482 | svm_range_bo_unref(svm_bo: prange->svm_bo); |
483 | /* wait for the fence to avoid long spin-loop |
484 | * at list_empty_careful |
485 | */ |
486 | dma_fence_wait(fence: f, intr: false); |
487 | dma_fence_put(fence: f); |
488 | } else { |
489 | /* The BO was still around and we got |
490 | * a new reference to it |
491 | */ |
492 | mutex_unlock(lock: &prange->lock); |
493 | pr_debug("reuse old bo svms 0x%p [0x%lx 0x%lx]\n" , |
494 | prange->svms, prange->start, prange->last); |
495 | |
496 | prange->ttm_res = prange->svm_bo->bo->tbo.resource; |
497 | return true; |
498 | } |
499 | |
500 | } else { |
501 | mutex_unlock(lock: &prange->lock); |
502 | } |
503 | |
504 | /* We need a new svm_bo. Spin-loop to wait for concurrent |
505 | * svm_range_bo_release to finish removing this range from |
506 | * its range list and set prange->svm_bo to null. After this, |
507 | * it is safe to reuse the svm_bo pointer and svm_bo_list head. |
508 | */ |
509 | while (!list_empty_careful(head: &prange->svm_bo_list) || prange->svm_bo) |
510 | cond_resched(); |
511 | |
512 | return false; |
513 | } |
514 | |
515 | static struct svm_range_bo *svm_range_bo_new(void) |
516 | { |
517 | struct svm_range_bo *svm_bo; |
518 | |
519 | svm_bo = kzalloc(size: sizeof(*svm_bo), GFP_KERNEL); |
520 | if (!svm_bo) |
521 | return NULL; |
522 | |
523 | kref_init(kref: &svm_bo->kref); |
524 | INIT_LIST_HEAD(list: &svm_bo->range_list); |
525 | spin_lock_init(&svm_bo->list_lock); |
526 | |
527 | return svm_bo; |
528 | } |
529 | |
530 | int |
531 | svm_range_vram_node_new(struct kfd_node *node, struct svm_range *prange, |
532 | bool clear) |
533 | { |
534 | struct amdgpu_bo_param bp; |
535 | struct svm_range_bo *svm_bo; |
536 | struct amdgpu_bo_user *ubo; |
537 | struct amdgpu_bo *bo; |
538 | struct kfd_process *p; |
539 | struct mm_struct *mm; |
540 | int r; |
541 | |
542 | p = container_of(prange->svms, struct kfd_process, svms); |
543 | pr_debug("pasid: %x svms 0x%p [0x%lx 0x%lx]\n" , p->pasid, prange->svms, |
544 | prange->start, prange->last); |
545 | |
546 | if (svm_range_validate_svm_bo(node, prange)) |
547 | return 0; |
548 | |
549 | svm_bo = svm_range_bo_new(); |
550 | if (!svm_bo) { |
551 | pr_debug("failed to alloc svm bo\n" ); |
552 | return -ENOMEM; |
553 | } |
554 | mm = get_task_mm(task: p->lead_thread); |
555 | if (!mm) { |
556 | pr_debug("failed to get mm\n" ); |
557 | kfree(objp: svm_bo); |
558 | return -ESRCH; |
559 | } |
560 | svm_bo->node = node; |
561 | svm_bo->eviction_fence = |
562 | amdgpu_amdkfd_fence_create(context: dma_fence_context_alloc(num: 1), |
563 | mm, |
564 | svm_bo); |
565 | mmput(mm); |
566 | INIT_WORK(&svm_bo->eviction_work, svm_range_evict_svm_bo_worker); |
567 | svm_bo->evicting = 0; |
568 | memset(&bp, 0, sizeof(bp)); |
569 | bp.size = prange->npages * PAGE_SIZE; |
570 | bp.byte_align = PAGE_SIZE; |
571 | bp.domain = AMDGPU_GEM_DOMAIN_VRAM; |
572 | bp.flags = AMDGPU_GEM_CREATE_NO_CPU_ACCESS; |
573 | bp.flags |= clear ? AMDGPU_GEM_CREATE_VRAM_CLEARED : 0; |
574 | bp.flags |= AMDGPU_GEM_CREATE_DISCARDABLE; |
575 | bp.type = ttm_bo_type_device; |
576 | bp.resv = NULL; |
577 | if (node->xcp) |
578 | bp.xcp_id_plus1 = node->xcp->id + 1; |
579 | |
580 | r = amdgpu_bo_create_user(adev: node->adev, bp: &bp, ubo_ptr: &ubo); |
581 | if (r) { |
582 | pr_debug("failed %d to create bo\n" , r); |
583 | goto create_bo_failed; |
584 | } |
585 | bo = &ubo->bo; |
586 | |
587 | pr_debug("alloc bo at offset 0x%lx size 0x%lx on partition %d\n" , |
588 | bo->tbo.resource->start << PAGE_SHIFT, bp.size, |
589 | bp.xcp_id_plus1 - 1); |
590 | |
591 | r = amdgpu_bo_reserve(bo, no_intr: true); |
592 | if (r) { |
593 | pr_debug("failed %d to reserve bo\n" , r); |
594 | goto reserve_bo_failed; |
595 | } |
596 | |
597 | if (clear) { |
598 | r = amdgpu_bo_sync_wait(bo, AMDGPU_FENCE_OWNER_KFD, intr: false); |
599 | if (r) { |
600 | pr_debug("failed %d to sync bo\n" , r); |
601 | amdgpu_bo_unreserve(bo); |
602 | goto reserve_bo_failed; |
603 | } |
604 | } |
605 | |
606 | r = dma_resv_reserve_fences(obj: bo->tbo.base.resv, num_fences: 1); |
607 | if (r) { |
608 | pr_debug("failed %d to reserve bo\n" , r); |
609 | amdgpu_bo_unreserve(bo); |
610 | goto reserve_bo_failed; |
611 | } |
612 | amdgpu_bo_fence(bo, fence: &svm_bo->eviction_fence->base, shared: true); |
613 | |
614 | amdgpu_bo_unreserve(bo); |
615 | |
616 | svm_bo->bo = bo; |
617 | prange->svm_bo = svm_bo; |
618 | prange->ttm_res = bo->tbo.resource; |
619 | prange->offset = 0; |
620 | |
621 | spin_lock(lock: &svm_bo->list_lock); |
622 | list_add(new: &prange->svm_bo_list, head: &svm_bo->range_list); |
623 | spin_unlock(lock: &svm_bo->list_lock); |
624 | |
625 | return 0; |
626 | |
627 | reserve_bo_failed: |
628 | amdgpu_bo_unref(bo: &bo); |
629 | create_bo_failed: |
630 | dma_fence_put(fence: &svm_bo->eviction_fence->base); |
631 | kfree(objp: svm_bo); |
632 | prange->ttm_res = NULL; |
633 | |
634 | return r; |
635 | } |
636 | |
637 | void svm_range_vram_node_free(struct svm_range *prange) |
638 | { |
639 | /* serialize prange->svm_bo unref */ |
640 | mutex_lock(&prange->lock); |
641 | /* prange->svm_bo has not been unref */ |
642 | if (prange->ttm_res) { |
643 | prange->ttm_res = NULL; |
644 | mutex_unlock(lock: &prange->lock); |
645 | svm_range_bo_unref(svm_bo: prange->svm_bo); |
646 | } else |
647 | mutex_unlock(lock: &prange->lock); |
648 | } |
649 | |
650 | struct kfd_node * |
651 | svm_range_get_node_by_id(struct svm_range *prange, uint32_t gpu_id) |
652 | { |
653 | struct kfd_process *p; |
654 | struct kfd_process_device *pdd; |
655 | |
656 | p = container_of(prange->svms, struct kfd_process, svms); |
657 | pdd = kfd_process_device_data_by_id(process: p, gpu_id); |
658 | if (!pdd) { |
659 | pr_debug("failed to get kfd process device by id 0x%x\n" , gpu_id); |
660 | return NULL; |
661 | } |
662 | |
663 | return pdd->dev; |
664 | } |
665 | |
666 | struct kfd_process_device * |
667 | svm_range_get_pdd_by_node(struct svm_range *prange, struct kfd_node *node) |
668 | { |
669 | struct kfd_process *p; |
670 | |
671 | p = container_of(prange->svms, struct kfd_process, svms); |
672 | |
673 | return kfd_get_process_device_data(dev: node, p); |
674 | } |
675 | |
676 | static int svm_range_bo_validate(void *param, struct amdgpu_bo *bo) |
677 | { |
678 | struct ttm_operation_ctx ctx = { false, false }; |
679 | |
680 | amdgpu_bo_placement_from_domain(abo: bo, AMDGPU_GEM_DOMAIN_VRAM); |
681 | |
682 | return ttm_bo_validate(bo: &bo->tbo, placement: &bo->placement, ctx: &ctx); |
683 | } |
684 | |
685 | static int |
686 | svm_range_check_attr(struct kfd_process *p, |
687 | uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs) |
688 | { |
689 | uint32_t i; |
690 | |
691 | for (i = 0; i < nattr; i++) { |
692 | uint32_t val = attrs[i].value; |
693 | int gpuidx = MAX_GPU_INSTANCE; |
694 | |
695 | switch (attrs[i].type) { |
696 | case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC: |
697 | if (val != KFD_IOCTL_SVM_LOCATION_SYSMEM && |
698 | val != KFD_IOCTL_SVM_LOCATION_UNDEFINED) |
699 | gpuidx = kfd_process_gpuidx_from_gpuid(p, gpu_id: val); |
700 | break; |
701 | case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC: |
702 | if (val != KFD_IOCTL_SVM_LOCATION_SYSMEM) |
703 | gpuidx = kfd_process_gpuidx_from_gpuid(p, gpu_id: val); |
704 | break; |
705 | case KFD_IOCTL_SVM_ATTR_ACCESS: |
706 | case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE: |
707 | case KFD_IOCTL_SVM_ATTR_NO_ACCESS: |
708 | gpuidx = kfd_process_gpuidx_from_gpuid(p, gpu_id: val); |
709 | break; |
710 | case KFD_IOCTL_SVM_ATTR_SET_FLAGS: |
711 | break; |
712 | case KFD_IOCTL_SVM_ATTR_CLR_FLAGS: |
713 | break; |
714 | case KFD_IOCTL_SVM_ATTR_GRANULARITY: |
715 | break; |
716 | default: |
717 | pr_debug("unknown attr type 0x%x\n" , attrs[i].type); |
718 | return -EINVAL; |
719 | } |
720 | |
721 | if (gpuidx < 0) { |
722 | pr_debug("no GPU 0x%x found\n" , val); |
723 | return -EINVAL; |
724 | } else if (gpuidx < MAX_GPU_INSTANCE && |
725 | !test_bit(gpuidx, p->svms.bitmap_supported)) { |
726 | pr_debug("GPU 0x%x not supported\n" , val); |
727 | return -EINVAL; |
728 | } |
729 | } |
730 | |
731 | return 0; |
732 | } |
733 | |
734 | static void |
735 | svm_range_apply_attrs(struct kfd_process *p, struct svm_range *prange, |
736 | uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs, |
737 | bool *update_mapping) |
738 | { |
739 | uint32_t i; |
740 | int gpuidx; |
741 | |
742 | for (i = 0; i < nattr; i++) { |
743 | switch (attrs[i].type) { |
744 | case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC: |
745 | prange->preferred_loc = attrs[i].value; |
746 | break; |
747 | case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC: |
748 | prange->prefetch_loc = attrs[i].value; |
749 | break; |
750 | case KFD_IOCTL_SVM_ATTR_ACCESS: |
751 | case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE: |
752 | case KFD_IOCTL_SVM_ATTR_NO_ACCESS: |
753 | if (!p->xnack_enabled) |
754 | *update_mapping = true; |
755 | |
756 | gpuidx = kfd_process_gpuidx_from_gpuid(p, |
757 | gpu_id: attrs[i].value); |
758 | if (attrs[i].type == KFD_IOCTL_SVM_ATTR_NO_ACCESS) { |
759 | bitmap_clear(map: prange->bitmap_access, start: gpuidx, nbits: 1); |
760 | bitmap_clear(map: prange->bitmap_aip, start: gpuidx, nbits: 1); |
761 | } else if (attrs[i].type == KFD_IOCTL_SVM_ATTR_ACCESS) { |
762 | bitmap_set(map: prange->bitmap_access, start: gpuidx, nbits: 1); |
763 | bitmap_clear(map: prange->bitmap_aip, start: gpuidx, nbits: 1); |
764 | } else { |
765 | bitmap_clear(map: prange->bitmap_access, start: gpuidx, nbits: 1); |
766 | bitmap_set(map: prange->bitmap_aip, start: gpuidx, nbits: 1); |
767 | } |
768 | break; |
769 | case KFD_IOCTL_SVM_ATTR_SET_FLAGS: |
770 | *update_mapping = true; |
771 | prange->flags |= attrs[i].value; |
772 | break; |
773 | case KFD_IOCTL_SVM_ATTR_CLR_FLAGS: |
774 | *update_mapping = true; |
775 | prange->flags &= ~attrs[i].value; |
776 | break; |
777 | case KFD_IOCTL_SVM_ATTR_GRANULARITY: |
778 | prange->granularity = min_t(uint32_t, attrs[i].value, 0x3F); |
779 | break; |
780 | default: |
781 | WARN_ONCE(1, "svm_range_check_attrs wasn't called?" ); |
782 | } |
783 | } |
784 | } |
785 | |
786 | static bool |
787 | svm_range_is_same_attrs(struct kfd_process *p, struct svm_range *prange, |
788 | uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs) |
789 | { |
790 | uint32_t i; |
791 | int gpuidx; |
792 | |
793 | for (i = 0; i < nattr; i++) { |
794 | switch (attrs[i].type) { |
795 | case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC: |
796 | if (prange->preferred_loc != attrs[i].value) |
797 | return false; |
798 | break; |
799 | case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC: |
800 | /* Prefetch should always trigger a migration even |
801 | * if the value of the attribute didn't change. |
802 | */ |
803 | return false; |
804 | case KFD_IOCTL_SVM_ATTR_ACCESS: |
805 | case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE: |
806 | case KFD_IOCTL_SVM_ATTR_NO_ACCESS: |
807 | gpuidx = kfd_process_gpuidx_from_gpuid(p, |
808 | gpu_id: attrs[i].value); |
809 | if (attrs[i].type == KFD_IOCTL_SVM_ATTR_NO_ACCESS) { |
810 | if (test_bit(gpuidx, prange->bitmap_access) || |
811 | test_bit(gpuidx, prange->bitmap_aip)) |
812 | return false; |
813 | } else if (attrs[i].type == KFD_IOCTL_SVM_ATTR_ACCESS) { |
814 | if (!test_bit(gpuidx, prange->bitmap_access)) |
815 | return false; |
816 | } else { |
817 | if (!test_bit(gpuidx, prange->bitmap_aip)) |
818 | return false; |
819 | } |
820 | break; |
821 | case KFD_IOCTL_SVM_ATTR_SET_FLAGS: |
822 | if ((prange->flags & attrs[i].value) != attrs[i].value) |
823 | return false; |
824 | break; |
825 | case KFD_IOCTL_SVM_ATTR_CLR_FLAGS: |
826 | if ((prange->flags & attrs[i].value) != 0) |
827 | return false; |
828 | break; |
829 | case KFD_IOCTL_SVM_ATTR_GRANULARITY: |
830 | if (prange->granularity != attrs[i].value) |
831 | return false; |
832 | break; |
833 | default: |
834 | WARN_ONCE(1, "svm_range_check_attrs wasn't called?" ); |
835 | } |
836 | } |
837 | |
838 | return true; |
839 | } |
840 | |
841 | /** |
842 | * svm_range_debug_dump - print all range information from svms |
843 | * @svms: svm range list header |
844 | * |
845 | * debug output svm range start, end, prefetch location from svms |
846 | * interval tree and link list |
847 | * |
848 | * Context: The caller must hold svms->lock |
849 | */ |
850 | static void svm_range_debug_dump(struct svm_range_list *svms) |
851 | { |
852 | struct interval_tree_node *node; |
853 | struct svm_range *prange; |
854 | |
855 | pr_debug("dump svms 0x%p list\n" , svms); |
856 | pr_debug("range\tstart\tpage\tend\t\tlocation\n" ); |
857 | |
858 | list_for_each_entry(prange, &svms->list, list) { |
859 | pr_debug("0x%p 0x%lx\t0x%llx\t0x%llx\t0x%x\n" , |
860 | prange, prange->start, prange->npages, |
861 | prange->start + prange->npages - 1, |
862 | prange->actual_loc); |
863 | } |
864 | |
865 | pr_debug("dump svms 0x%p interval tree\n" , svms); |
866 | pr_debug("range\tstart\tpage\tend\t\tlocation\n" ); |
867 | node = interval_tree_iter_first(root: &svms->objects, start: 0, last: ~0ULL); |
868 | while (node) { |
869 | prange = container_of(node, struct svm_range, it_node); |
870 | pr_debug("0x%p 0x%lx\t0x%llx\t0x%llx\t0x%x\n" , |
871 | prange, prange->start, prange->npages, |
872 | prange->start + prange->npages - 1, |
873 | prange->actual_loc); |
874 | node = interval_tree_iter_next(node, start: 0, last: ~0ULL); |
875 | } |
876 | } |
877 | |
878 | static void * |
879 | svm_range_copy_array(void *psrc, size_t size, uint64_t num_elements, |
880 | uint64_t offset, uint64_t *vram_pages) |
881 | { |
882 | unsigned char *src = (unsigned char *)psrc + offset; |
883 | unsigned char *dst; |
884 | uint64_t i; |
885 | |
886 | dst = kvmalloc_array(n: num_elements, size, GFP_KERNEL); |
887 | if (!dst) |
888 | return NULL; |
889 | |
890 | if (!vram_pages) { |
891 | memcpy(dst, src, num_elements * size); |
892 | return (void *)dst; |
893 | } |
894 | |
895 | *vram_pages = 0; |
896 | for (i = 0; i < num_elements; i++) { |
897 | dma_addr_t *temp; |
898 | temp = (dma_addr_t *)dst + i; |
899 | *temp = *((dma_addr_t *)src + i); |
900 | if (*temp&SVM_RANGE_VRAM_DOMAIN) |
901 | (*vram_pages)++; |
902 | } |
903 | |
904 | return (void *)dst; |
905 | } |
906 | |
907 | static int |
908 | svm_range_copy_dma_addrs(struct svm_range *dst, struct svm_range *src) |
909 | { |
910 | int i; |
911 | |
912 | for (i = 0; i < MAX_GPU_INSTANCE; i++) { |
913 | if (!src->dma_addr[i]) |
914 | continue; |
915 | dst->dma_addr[i] = svm_range_copy_array(psrc: src->dma_addr[i], |
916 | size: sizeof(*src->dma_addr[i]), num_elements: src->npages, offset: 0, NULL); |
917 | if (!dst->dma_addr[i]) |
918 | return -ENOMEM; |
919 | } |
920 | |
921 | return 0; |
922 | } |
923 | |
924 | static int |
925 | svm_range_split_array(void *ppnew, void *ppold, size_t size, |
926 | uint64_t old_start, uint64_t old_n, |
927 | uint64_t new_start, uint64_t new_n, uint64_t *new_vram_pages) |
928 | { |
929 | unsigned char *new, *old, *pold; |
930 | uint64_t d; |
931 | |
932 | if (!ppold) |
933 | return 0; |
934 | pold = *(unsigned char **)ppold; |
935 | if (!pold) |
936 | return 0; |
937 | |
938 | d = (new_start - old_start) * size; |
939 | /* get dma addr array for new range and calculte its vram page number */ |
940 | new = svm_range_copy_array(psrc: pold, size, num_elements: new_n, offset: d, vram_pages: new_vram_pages); |
941 | if (!new) |
942 | return -ENOMEM; |
943 | d = (new_start == old_start) ? new_n * size : 0; |
944 | old = svm_range_copy_array(psrc: pold, size, num_elements: old_n, offset: d, NULL); |
945 | if (!old) { |
946 | kvfree(addr: new); |
947 | return -ENOMEM; |
948 | } |
949 | kvfree(addr: pold); |
950 | *(void **)ppold = old; |
951 | *(void **)ppnew = new; |
952 | |
953 | return 0; |
954 | } |
955 | |
956 | static int |
957 | svm_range_split_pages(struct svm_range *new, struct svm_range *old, |
958 | uint64_t start, uint64_t last) |
959 | { |
960 | uint64_t npages = last - start + 1; |
961 | int i, r; |
962 | |
963 | for (i = 0; i < MAX_GPU_INSTANCE; i++) { |
964 | r = svm_range_split_array(ppnew: &new->dma_addr[i], ppold: &old->dma_addr[i], |
965 | size: sizeof(*old->dma_addr[i]), old_start: old->start, |
966 | old_n: npages, new_start: new->start, new_n: new->npages, |
967 | new_vram_pages: old->actual_loc ? &new->vram_pages : NULL); |
968 | if (r) |
969 | return r; |
970 | } |
971 | if (old->actual_loc) |
972 | old->vram_pages -= new->vram_pages; |
973 | |
974 | return 0; |
975 | } |
976 | |
977 | static int |
978 | svm_range_split_nodes(struct svm_range *new, struct svm_range *old, |
979 | uint64_t start, uint64_t last) |
980 | { |
981 | uint64_t npages = last - start + 1; |
982 | |
983 | pr_debug("svms 0x%p new prange 0x%p start 0x%lx [0x%llx 0x%llx]\n" , |
984 | new->svms, new, new->start, start, last); |
985 | |
986 | if (new->start == old->start) { |
987 | new->offset = old->offset; |
988 | old->offset += new->npages; |
989 | } else { |
990 | new->offset = old->offset + npages; |
991 | } |
992 | |
993 | new->svm_bo = svm_range_bo_ref(svm_bo: old->svm_bo); |
994 | new->ttm_res = old->ttm_res; |
995 | |
996 | spin_lock(lock: &new->svm_bo->list_lock); |
997 | list_add(new: &new->svm_bo_list, head: &new->svm_bo->range_list); |
998 | spin_unlock(lock: &new->svm_bo->list_lock); |
999 | |
1000 | return 0; |
1001 | } |
1002 | |
1003 | /** |
1004 | * svm_range_split_adjust - split range and adjust |
1005 | * |
1006 | * @new: new range |
1007 | * @old: the old range |
1008 | * @start: the old range adjust to start address in pages |
1009 | * @last: the old range adjust to last address in pages |
1010 | * |
1011 | * Copy system memory dma_addr or vram ttm_res in old range to new |
1012 | * range from new_start up to size new->npages, the remaining old range is from |
1013 | * start to last |
1014 | * |
1015 | * Return: |
1016 | * 0 - OK, -ENOMEM - out of memory |
1017 | */ |
1018 | static int |
1019 | svm_range_split_adjust(struct svm_range *new, struct svm_range *old, |
1020 | uint64_t start, uint64_t last) |
1021 | { |
1022 | int r; |
1023 | |
1024 | pr_debug("svms 0x%p new 0x%lx old [0x%lx 0x%lx] => [0x%llx 0x%llx]\n" , |
1025 | new->svms, new->start, old->start, old->last, start, last); |
1026 | |
1027 | if (new->start < old->start || |
1028 | new->last > old->last) { |
1029 | WARN_ONCE(1, "invalid new range start or last\n" ); |
1030 | return -EINVAL; |
1031 | } |
1032 | |
1033 | r = svm_range_split_pages(new, old, start, last); |
1034 | if (r) |
1035 | return r; |
1036 | |
1037 | if (old->actual_loc && old->ttm_res) { |
1038 | r = svm_range_split_nodes(new, old, start, last); |
1039 | if (r) |
1040 | return r; |
1041 | } |
1042 | |
1043 | old->npages = last - start + 1; |
1044 | old->start = start; |
1045 | old->last = last; |
1046 | new->flags = old->flags; |
1047 | new->preferred_loc = old->preferred_loc; |
1048 | new->prefetch_loc = old->prefetch_loc; |
1049 | new->actual_loc = old->actual_loc; |
1050 | new->granularity = old->granularity; |
1051 | new->mapped_to_gpu = old->mapped_to_gpu; |
1052 | bitmap_copy(dst: new->bitmap_access, src: old->bitmap_access, MAX_GPU_INSTANCE); |
1053 | bitmap_copy(dst: new->bitmap_aip, src: old->bitmap_aip, MAX_GPU_INSTANCE); |
1054 | |
1055 | return 0; |
1056 | } |
1057 | |
1058 | /** |
1059 | * svm_range_split - split a range in 2 ranges |
1060 | * |
1061 | * @prange: the svm range to split |
1062 | * @start: the remaining range start address in pages |
1063 | * @last: the remaining range last address in pages |
1064 | * @new: the result new range generated |
1065 | * |
1066 | * Two cases only: |
1067 | * case 1: if start == prange->start |
1068 | * prange ==> prange[start, last] |
1069 | * new range [last + 1, prange->last] |
1070 | * |
1071 | * case 2: if last == prange->last |
1072 | * prange ==> prange[start, last] |
1073 | * new range [prange->start, start - 1] |
1074 | * |
1075 | * Return: |
1076 | * 0 - OK, -ENOMEM - out of memory, -EINVAL - invalid start, last |
1077 | */ |
1078 | static int |
1079 | svm_range_split(struct svm_range *prange, uint64_t start, uint64_t last, |
1080 | struct svm_range **new) |
1081 | { |
1082 | uint64_t old_start = prange->start; |
1083 | uint64_t old_last = prange->last; |
1084 | struct svm_range_list *svms; |
1085 | int r = 0; |
1086 | |
1087 | pr_debug("svms 0x%p [0x%llx 0x%llx] to [0x%llx 0x%llx]\n" , prange->svms, |
1088 | old_start, old_last, start, last); |
1089 | |
1090 | if (old_start != start && old_last != last) |
1091 | return -EINVAL; |
1092 | if (start < old_start || last > old_last) |
1093 | return -EINVAL; |
1094 | |
1095 | svms = prange->svms; |
1096 | if (old_start == start) |
1097 | *new = svm_range_new(svms, start: last + 1, last: old_last, update_mem_usage: false); |
1098 | else |
1099 | *new = svm_range_new(svms, start: old_start, last: start - 1, update_mem_usage: false); |
1100 | if (!*new) |
1101 | return -ENOMEM; |
1102 | |
1103 | r = svm_range_split_adjust(new: *new, old: prange, start, last); |
1104 | if (r) { |
1105 | pr_debug("failed %d split [0x%llx 0x%llx] to [0x%llx 0x%llx]\n" , |
1106 | r, old_start, old_last, start, last); |
1107 | svm_range_free(prange: *new, do_unmap: false); |
1108 | *new = NULL; |
1109 | } |
1110 | |
1111 | return r; |
1112 | } |
1113 | |
1114 | static int |
1115 | svm_range_split_tail(struct svm_range *prange, uint64_t new_last, |
1116 | struct list_head *insert_list, struct list_head *remap_list) |
1117 | { |
1118 | struct svm_range *tail = NULL; |
1119 | int r = svm_range_split(prange, start: prange->start, last: new_last, new: &tail); |
1120 | |
1121 | if (!r) { |
1122 | list_add(new: &tail->list, head: insert_list); |
1123 | if (!IS_ALIGNED(new_last + 1, 1UL << prange->granularity)) |
1124 | list_add(new: &tail->update_list, head: remap_list); |
1125 | } |
1126 | return r; |
1127 | } |
1128 | |
1129 | static int |
1130 | svm_range_split_head(struct svm_range *prange, uint64_t new_start, |
1131 | struct list_head *insert_list, struct list_head *remap_list) |
1132 | { |
1133 | struct svm_range *head = NULL; |
1134 | int r = svm_range_split(prange, start: new_start, last: prange->last, new: &head); |
1135 | |
1136 | if (!r) { |
1137 | list_add(new: &head->list, head: insert_list); |
1138 | if (!IS_ALIGNED(new_start, 1UL << prange->granularity)) |
1139 | list_add(new: &head->update_list, head: remap_list); |
1140 | } |
1141 | return r; |
1142 | } |
1143 | |
1144 | static void |
1145 | svm_range_add_child(struct svm_range *prange, struct mm_struct *mm, |
1146 | struct svm_range *pchild, enum svm_work_list_ops op) |
1147 | { |
1148 | pr_debug("add child 0x%p [0x%lx 0x%lx] to prange 0x%p child list %d\n" , |
1149 | pchild, pchild->start, pchild->last, prange, op); |
1150 | |
1151 | pchild->work_item.mm = mm; |
1152 | pchild->work_item.op = op; |
1153 | list_add_tail(new: &pchild->child_list, head: &prange->child_list); |
1154 | } |
1155 | |
1156 | static bool |
1157 | svm_nodes_in_same_hive(struct kfd_node *node_a, struct kfd_node *node_b) |
1158 | { |
1159 | return (node_a->adev == node_b->adev || |
1160 | amdgpu_xgmi_same_hive(adev: node_a->adev, bo_adev: node_b->adev)); |
1161 | } |
1162 | |
1163 | static uint64_t |
1164 | svm_range_get_pte_flags(struct kfd_node *node, |
1165 | struct svm_range *prange, int domain) |
1166 | { |
1167 | struct kfd_node *bo_node; |
1168 | uint32_t flags = prange->flags; |
1169 | uint32_t mapping_flags = 0; |
1170 | uint64_t pte_flags; |
1171 | bool snoop = (domain != SVM_RANGE_VRAM_DOMAIN); |
1172 | bool coherent = flags & (KFD_IOCTL_SVM_FLAG_COHERENT | KFD_IOCTL_SVM_FLAG_EXT_COHERENT); |
1173 | bool ext_coherent = flags & KFD_IOCTL_SVM_FLAG_EXT_COHERENT; |
1174 | bool uncached = false; /*flags & KFD_IOCTL_SVM_FLAG_UNCACHED;*/ |
1175 | unsigned int mtype_local; |
1176 | |
1177 | if (domain == SVM_RANGE_VRAM_DOMAIN) |
1178 | bo_node = prange->svm_bo->node; |
1179 | |
1180 | switch (amdgpu_ip_version(adev: node->adev, ip: GC_HWIP, inst: 0)) { |
1181 | case IP_VERSION(9, 4, 1): |
1182 | if (domain == SVM_RANGE_VRAM_DOMAIN) { |
1183 | if (bo_node == node) { |
1184 | mapping_flags |= coherent ? |
1185 | AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW; |
1186 | } else { |
1187 | mapping_flags |= coherent ? |
1188 | AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; |
1189 | if (svm_nodes_in_same_hive(node_a: node, node_b: bo_node)) |
1190 | snoop = true; |
1191 | } |
1192 | } else { |
1193 | mapping_flags |= coherent ? |
1194 | AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; |
1195 | } |
1196 | break; |
1197 | case IP_VERSION(9, 4, 2): |
1198 | if (domain == SVM_RANGE_VRAM_DOMAIN) { |
1199 | if (bo_node == node) { |
1200 | mapping_flags |= coherent ? |
1201 | AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW; |
1202 | if (node->adev->gmc.xgmi.connected_to_cpu) |
1203 | snoop = true; |
1204 | } else { |
1205 | mapping_flags |= coherent ? |
1206 | AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; |
1207 | if (svm_nodes_in_same_hive(node_a: node, node_b: bo_node)) |
1208 | snoop = true; |
1209 | } |
1210 | } else { |
1211 | mapping_flags |= coherent ? |
1212 | AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; |
1213 | } |
1214 | break; |
1215 | case IP_VERSION(9, 4, 3): |
1216 | if (ext_coherent) |
1217 | mtype_local = node->adev->rev_id ? AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_UC; |
1218 | else |
1219 | mtype_local = amdgpu_mtype_local == 1 ? AMDGPU_VM_MTYPE_NC : |
1220 | amdgpu_mtype_local == 2 ? AMDGPU_VM_MTYPE_CC : AMDGPU_VM_MTYPE_RW; |
1221 | snoop = true; |
1222 | if (uncached) { |
1223 | mapping_flags |= AMDGPU_VM_MTYPE_UC; |
1224 | } else if (domain == SVM_RANGE_VRAM_DOMAIN) { |
1225 | /* local HBM region close to partition */ |
1226 | if (bo_node->adev == node->adev && |
1227 | (!bo_node->xcp || !node->xcp || bo_node->xcp->mem_id == node->xcp->mem_id)) |
1228 | mapping_flags |= mtype_local; |
1229 | /* local HBM region far from partition or remote XGMI GPU |
1230 | * with regular system scope coherence |
1231 | */ |
1232 | else if (svm_nodes_in_same_hive(node_a: bo_node, node_b: node) && !ext_coherent) |
1233 | mapping_flags |= AMDGPU_VM_MTYPE_NC; |
1234 | /* PCIe P2P or extended system scope coherence */ |
1235 | else |
1236 | mapping_flags |= AMDGPU_VM_MTYPE_UC; |
1237 | /* system memory accessed by the APU */ |
1238 | } else if (node->adev->flags & AMD_IS_APU) { |
1239 | /* On NUMA systems, locality is determined per-page |
1240 | * in amdgpu_gmc_override_vm_pte_flags |
1241 | */ |
1242 | if (num_possible_nodes() <= 1) |
1243 | mapping_flags |= mtype_local; |
1244 | else |
1245 | mapping_flags |= ext_coherent ? AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; |
1246 | /* system memory accessed by the dGPU */ |
1247 | } else { |
1248 | mapping_flags |= AMDGPU_VM_MTYPE_UC; |
1249 | } |
1250 | break; |
1251 | default: |
1252 | mapping_flags |= coherent ? |
1253 | AMDGPU_VM_MTYPE_UC : AMDGPU_VM_MTYPE_NC; |
1254 | } |
1255 | |
1256 | mapping_flags |= AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE; |
1257 | |
1258 | if (flags & KFD_IOCTL_SVM_FLAG_GPU_RO) |
1259 | mapping_flags &= ~AMDGPU_VM_PAGE_WRITEABLE; |
1260 | if (flags & KFD_IOCTL_SVM_FLAG_GPU_EXEC) |
1261 | mapping_flags |= AMDGPU_VM_PAGE_EXECUTABLE; |
1262 | |
1263 | pte_flags = AMDGPU_PTE_VALID; |
1264 | pte_flags |= (domain == SVM_RANGE_VRAM_DOMAIN) ? 0 : AMDGPU_PTE_SYSTEM; |
1265 | pte_flags |= snoop ? AMDGPU_PTE_SNOOPED : 0; |
1266 | |
1267 | pte_flags |= amdgpu_gem_va_map_flags(adev: node->adev, flags: mapping_flags); |
1268 | return pte_flags; |
1269 | } |
1270 | |
1271 | static int |
1272 | svm_range_unmap_from_gpu(struct amdgpu_device *adev, struct amdgpu_vm *vm, |
1273 | uint64_t start, uint64_t last, |
1274 | struct dma_fence **fence) |
1275 | { |
1276 | uint64_t init_pte_value = 0; |
1277 | |
1278 | pr_debug("[0x%llx 0x%llx]\n" , start, last); |
1279 | |
1280 | return amdgpu_vm_update_range(adev, vm, immediate: false, unlocked: true, flush_tlb: true, allow_override: false, NULL, start, |
1281 | last, flags: init_pte_value, offset: 0, vram_base: 0, NULL, NULL, |
1282 | fence); |
1283 | } |
1284 | |
1285 | static int |
1286 | svm_range_unmap_from_gpus(struct svm_range *prange, unsigned long start, |
1287 | unsigned long last, uint32_t trigger) |
1288 | { |
1289 | DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE); |
1290 | struct kfd_process_device *pdd; |
1291 | struct dma_fence *fence = NULL; |
1292 | struct kfd_process *p; |
1293 | uint32_t gpuidx; |
1294 | int r = 0; |
1295 | |
1296 | if (!prange->mapped_to_gpu) { |
1297 | pr_debug("prange 0x%p [0x%lx 0x%lx] not mapped to GPU\n" , |
1298 | prange, prange->start, prange->last); |
1299 | return 0; |
1300 | } |
1301 | |
1302 | if (prange->start == start && prange->last == last) { |
1303 | pr_debug("unmap svms 0x%p prange 0x%p\n" , prange->svms, prange); |
1304 | prange->mapped_to_gpu = false; |
1305 | } |
1306 | |
1307 | bitmap_or(dst: bitmap, src1: prange->bitmap_access, src2: prange->bitmap_aip, |
1308 | MAX_GPU_INSTANCE); |
1309 | p = container_of(prange->svms, struct kfd_process, svms); |
1310 | |
1311 | for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) { |
1312 | pr_debug("unmap from gpu idx 0x%x\n" , gpuidx); |
1313 | pdd = kfd_process_device_from_gpuidx(p, gpuidx); |
1314 | if (!pdd) { |
1315 | pr_debug("failed to find device idx %d\n" , gpuidx); |
1316 | return -EINVAL; |
1317 | } |
1318 | |
1319 | kfd_smi_event_unmap_from_gpu(node: pdd->dev, pid: p->lead_thread->pid, |
1320 | address: start, last, trigger); |
1321 | |
1322 | r = svm_range_unmap_from_gpu(adev: pdd->dev->adev, |
1323 | drm_priv_to_vm(pdd->drm_priv), |
1324 | start, last, fence: &fence); |
1325 | if (r) |
1326 | break; |
1327 | |
1328 | if (fence) { |
1329 | r = dma_fence_wait(fence, intr: false); |
1330 | dma_fence_put(fence); |
1331 | fence = NULL; |
1332 | if (r) |
1333 | break; |
1334 | } |
1335 | kfd_flush_tlb(pdd, type: TLB_FLUSH_HEAVYWEIGHT); |
1336 | } |
1337 | |
1338 | return r; |
1339 | } |
1340 | |
1341 | static int |
1342 | svm_range_map_to_gpu(struct kfd_process_device *pdd, struct svm_range *prange, |
1343 | unsigned long offset, unsigned long npages, bool readonly, |
1344 | dma_addr_t *dma_addr, struct amdgpu_device *bo_adev, |
1345 | struct dma_fence **fence, bool flush_tlb) |
1346 | { |
1347 | struct amdgpu_device *adev = pdd->dev->adev; |
1348 | struct amdgpu_vm *vm = drm_priv_to_vm(pdd->drm_priv); |
1349 | uint64_t pte_flags; |
1350 | unsigned long last_start; |
1351 | int last_domain; |
1352 | int r = 0; |
1353 | int64_t i, j; |
1354 | |
1355 | last_start = prange->start + offset; |
1356 | |
1357 | pr_debug("svms 0x%p [0x%lx 0x%lx] readonly %d\n" , prange->svms, |
1358 | last_start, last_start + npages - 1, readonly); |
1359 | |
1360 | for (i = offset; i < offset + npages; i++) { |
1361 | last_domain = dma_addr[i] & SVM_RANGE_VRAM_DOMAIN; |
1362 | dma_addr[i] &= ~SVM_RANGE_VRAM_DOMAIN; |
1363 | |
1364 | /* Collect all pages in the same address range and memory domain |
1365 | * that can be mapped with a single call to update mapping. |
1366 | */ |
1367 | if (i < offset + npages - 1 && |
1368 | last_domain == (dma_addr[i + 1] & SVM_RANGE_VRAM_DOMAIN)) |
1369 | continue; |
1370 | |
1371 | pr_debug("Mapping range [0x%lx 0x%llx] on domain: %s\n" , |
1372 | last_start, prange->start + i, last_domain ? "GPU" : "CPU" ); |
1373 | |
1374 | pte_flags = svm_range_get_pte_flags(node: pdd->dev, prange, domain: last_domain); |
1375 | if (readonly) |
1376 | pte_flags &= ~AMDGPU_PTE_WRITEABLE; |
1377 | |
1378 | pr_debug("svms 0x%p map [0x%lx 0x%llx] vram %d PTE 0x%llx\n" , |
1379 | prange->svms, last_start, prange->start + i, |
1380 | (last_domain == SVM_RANGE_VRAM_DOMAIN) ? 1 : 0, |
1381 | pte_flags); |
1382 | |
1383 | /* For dGPU mode, we use same vm_manager to allocate VRAM for |
1384 | * different memory partition based on fpfn/lpfn, we should use |
1385 | * same vm_manager.vram_base_offset regardless memory partition. |
1386 | */ |
1387 | r = amdgpu_vm_update_range(adev, vm, immediate: false, unlocked: false, flush_tlb, allow_override: true, |
1388 | NULL, start: last_start, last: prange->start + i, |
1389 | flags: pte_flags, |
1390 | offset: (last_start - prange->start) << PAGE_SHIFT, |
1391 | vram_base: bo_adev ? bo_adev->vm_manager.vram_base_offset : 0, |
1392 | NULL, pages_addr: dma_addr, fence: &vm->last_update); |
1393 | |
1394 | for (j = last_start - prange->start; j <= i; j++) |
1395 | dma_addr[j] |= last_domain; |
1396 | |
1397 | if (r) { |
1398 | pr_debug("failed %d to map to gpu 0x%lx\n" , r, prange->start); |
1399 | goto out; |
1400 | } |
1401 | last_start = prange->start + i + 1; |
1402 | } |
1403 | |
1404 | r = amdgpu_vm_update_pdes(adev, vm, immediate: false); |
1405 | if (r) { |
1406 | pr_debug("failed %d to update directories 0x%lx\n" , r, |
1407 | prange->start); |
1408 | goto out; |
1409 | } |
1410 | |
1411 | if (fence) |
1412 | *fence = dma_fence_get(fence: vm->last_update); |
1413 | |
1414 | out: |
1415 | return r; |
1416 | } |
1417 | |
1418 | static int |
1419 | svm_range_map_to_gpus(struct svm_range *prange, unsigned long offset, |
1420 | unsigned long npages, bool readonly, |
1421 | unsigned long *bitmap, bool wait, bool flush_tlb) |
1422 | { |
1423 | struct kfd_process_device *pdd; |
1424 | struct amdgpu_device *bo_adev = NULL; |
1425 | struct kfd_process *p; |
1426 | struct dma_fence *fence = NULL; |
1427 | uint32_t gpuidx; |
1428 | int r = 0; |
1429 | |
1430 | if (prange->svm_bo && prange->ttm_res) |
1431 | bo_adev = prange->svm_bo->node->adev; |
1432 | |
1433 | p = container_of(prange->svms, struct kfd_process, svms); |
1434 | for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) { |
1435 | pr_debug("mapping to gpu idx 0x%x\n" , gpuidx); |
1436 | pdd = kfd_process_device_from_gpuidx(p, gpuidx); |
1437 | if (!pdd) { |
1438 | pr_debug("failed to find device idx %d\n" , gpuidx); |
1439 | return -EINVAL; |
1440 | } |
1441 | |
1442 | pdd = kfd_bind_process_to_device(dev: pdd->dev, p); |
1443 | if (IS_ERR(ptr: pdd)) |
1444 | return -EINVAL; |
1445 | |
1446 | if (bo_adev && pdd->dev->adev != bo_adev && |
1447 | !amdgpu_xgmi_same_hive(adev: pdd->dev->adev, bo_adev)) { |
1448 | pr_debug("cannot map to device idx %d\n" , gpuidx); |
1449 | continue; |
1450 | } |
1451 | |
1452 | r = svm_range_map_to_gpu(pdd, prange, offset, npages, readonly, |
1453 | dma_addr: prange->dma_addr[gpuidx], |
1454 | bo_adev, fence: wait ? &fence : NULL, |
1455 | flush_tlb); |
1456 | if (r) |
1457 | break; |
1458 | |
1459 | if (fence) { |
1460 | r = dma_fence_wait(fence, intr: false); |
1461 | dma_fence_put(fence); |
1462 | fence = NULL; |
1463 | if (r) { |
1464 | pr_debug("failed %d to dma fence wait\n" , r); |
1465 | break; |
1466 | } |
1467 | } |
1468 | |
1469 | kfd_flush_tlb(pdd, type: TLB_FLUSH_LEGACY); |
1470 | } |
1471 | |
1472 | return r; |
1473 | } |
1474 | |
1475 | struct svm_validate_context { |
1476 | struct kfd_process *process; |
1477 | struct svm_range *prange; |
1478 | bool intr; |
1479 | DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE); |
1480 | struct drm_exec exec; |
1481 | }; |
1482 | |
1483 | static int svm_range_reserve_bos(struct svm_validate_context *ctx, bool intr) |
1484 | { |
1485 | struct kfd_process_device *pdd; |
1486 | struct amdgpu_vm *vm; |
1487 | uint32_t gpuidx; |
1488 | int r; |
1489 | |
1490 | drm_exec_init(exec: &ctx->exec, flags: intr ? DRM_EXEC_INTERRUPTIBLE_WAIT: 0, nr: 0); |
1491 | drm_exec_until_all_locked(&ctx->exec) { |
1492 | for_each_set_bit(gpuidx, ctx->bitmap, MAX_GPU_INSTANCE) { |
1493 | pdd = kfd_process_device_from_gpuidx(p: ctx->process, gpuidx); |
1494 | if (!pdd) { |
1495 | pr_debug("failed to find device idx %d\n" , gpuidx); |
1496 | r = -EINVAL; |
1497 | goto unreserve_out; |
1498 | } |
1499 | vm = drm_priv_to_vm(pdd->drm_priv); |
1500 | |
1501 | r = amdgpu_vm_lock_pd(vm, exec: &ctx->exec, num_fences: 2); |
1502 | drm_exec_retry_on_contention(&ctx->exec); |
1503 | if (unlikely(r)) { |
1504 | pr_debug("failed %d to reserve bo\n" , r); |
1505 | goto unreserve_out; |
1506 | } |
1507 | } |
1508 | } |
1509 | |
1510 | for_each_set_bit(gpuidx, ctx->bitmap, MAX_GPU_INSTANCE) { |
1511 | pdd = kfd_process_device_from_gpuidx(p: ctx->process, gpuidx); |
1512 | if (!pdd) { |
1513 | pr_debug("failed to find device idx %d\n" , gpuidx); |
1514 | r = -EINVAL; |
1515 | goto unreserve_out; |
1516 | } |
1517 | |
1518 | r = amdgpu_vm_validate(adev: pdd->dev->adev, |
1519 | drm_priv_to_vm(pdd->drm_priv), NULL, |
1520 | callback: svm_range_bo_validate, NULL); |
1521 | if (r) { |
1522 | pr_debug("failed %d validate pt bos\n" , r); |
1523 | goto unreserve_out; |
1524 | } |
1525 | } |
1526 | |
1527 | return 0; |
1528 | |
1529 | unreserve_out: |
1530 | drm_exec_fini(exec: &ctx->exec); |
1531 | return r; |
1532 | } |
1533 | |
1534 | static void svm_range_unreserve_bos(struct svm_validate_context *ctx) |
1535 | { |
1536 | drm_exec_fini(exec: &ctx->exec); |
1537 | } |
1538 | |
1539 | static void *kfd_svm_page_owner(struct kfd_process *p, int32_t gpuidx) |
1540 | { |
1541 | struct kfd_process_device *pdd; |
1542 | |
1543 | pdd = kfd_process_device_from_gpuidx(p, gpuidx); |
1544 | if (!pdd) |
1545 | return NULL; |
1546 | |
1547 | return SVM_ADEV_PGMAP_OWNER(pdd->dev->adev); |
1548 | } |
1549 | |
1550 | /* |
1551 | * Validation+GPU mapping with concurrent invalidation (MMU notifiers) |
1552 | * |
1553 | * To prevent concurrent destruction or change of range attributes, the |
1554 | * svm_read_lock must be held. The caller must not hold the svm_write_lock |
1555 | * because that would block concurrent evictions and lead to deadlocks. To |
1556 | * serialize concurrent migrations or validations of the same range, the |
1557 | * prange->migrate_mutex must be held. |
1558 | * |
1559 | * For VRAM ranges, the SVM BO must be allocated and valid (protected by its |
1560 | * eviction fence. |
1561 | * |
1562 | * The following sequence ensures race-free validation and GPU mapping: |
1563 | * |
1564 | * 1. Reserve page table (and SVM BO if range is in VRAM) |
1565 | * 2. hmm_range_fault to get page addresses (if system memory) |
1566 | * 3. DMA-map pages (if system memory) |
1567 | * 4-a. Take notifier lock |
1568 | * 4-b. Check that pages still valid (mmu_interval_read_retry) |
1569 | * 4-c. Check that the range was not split or otherwise invalidated |
1570 | * 4-d. Update GPU page table |
1571 | * 4.e. Release notifier lock |
1572 | * 5. Release page table (and SVM BO) reservation |
1573 | */ |
1574 | static int svm_range_validate_and_map(struct mm_struct *mm, |
1575 | unsigned long map_start, unsigned long map_last, |
1576 | struct svm_range *prange, int32_t gpuidx, |
1577 | bool intr, bool wait, bool flush_tlb) |
1578 | { |
1579 | struct svm_validate_context *ctx; |
1580 | unsigned long start, end, addr; |
1581 | struct kfd_process *p; |
1582 | void *owner; |
1583 | int32_t idx; |
1584 | int r = 0; |
1585 | |
1586 | ctx = kzalloc(size: sizeof(struct svm_validate_context), GFP_KERNEL); |
1587 | if (!ctx) |
1588 | return -ENOMEM; |
1589 | ctx->process = container_of(prange->svms, struct kfd_process, svms); |
1590 | ctx->prange = prange; |
1591 | ctx->intr = intr; |
1592 | |
1593 | if (gpuidx < MAX_GPU_INSTANCE) { |
1594 | bitmap_zero(dst: ctx->bitmap, MAX_GPU_INSTANCE); |
1595 | bitmap_set(map: ctx->bitmap, start: gpuidx, nbits: 1); |
1596 | } else if (ctx->process->xnack_enabled) { |
1597 | bitmap_copy(dst: ctx->bitmap, src: prange->bitmap_aip, MAX_GPU_INSTANCE); |
1598 | |
		/* If the range is prefetched to a GPU, or a GPU retry fault
		 * migrates the range to a GPU that has the ACCESS attribute for
		 * the range, create the mapping on that GPU.
		 */
1603 | if (prange->actual_loc) { |
1604 | gpuidx = kfd_process_gpuidx_from_gpuid(p: ctx->process, |
1605 | gpu_id: prange->actual_loc); |
1606 | if (gpuidx < 0) { |
1607 | WARN_ONCE(1, "failed get device by id 0x%x\n" , |
1608 | prange->actual_loc); |
1609 | r = -EINVAL; |
1610 | goto free_ctx; |
1611 | } |
1612 | if (test_bit(gpuidx, prange->bitmap_access)) |
1613 | bitmap_set(map: ctx->bitmap, start: gpuidx, nbits: 1); |
1614 | } |
1615 | |
		/*
		 * If prange is already mapped or has the always-mapped flag
		 * set, update the mapping on all GPUs with the ACCESS attribute.
		 */
1620 | if (bitmap_empty(src: ctx->bitmap, MAX_GPU_INSTANCE)) { |
1621 | if (prange->mapped_to_gpu || |
1622 | prange->flags & KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED) |
1623 | bitmap_copy(dst: ctx->bitmap, src: prange->bitmap_access, MAX_GPU_INSTANCE); |
1624 | } |
1625 | } else { |
1626 | bitmap_or(dst: ctx->bitmap, src1: prange->bitmap_access, |
1627 | src2: prange->bitmap_aip, MAX_GPU_INSTANCE); |
1628 | } |
1629 | |
1630 | if (bitmap_empty(src: ctx->bitmap, MAX_GPU_INSTANCE)) { |
1631 | r = 0; |
1632 | goto free_ctx; |
1633 | } |
1634 | |
1635 | if (prange->actual_loc && !prange->ttm_res) { |
1636 | /* This should never happen. actual_loc gets set by |
1637 | * svm_migrate_ram_to_vram after allocating a BO. |
1638 | */ |
1639 | WARN_ONCE(1, "VRAM BO missing during validation\n" ); |
1640 | r = -EINVAL; |
1641 | goto free_ctx; |
1642 | } |
1643 | |
1644 | r = svm_range_reserve_bos(ctx, intr); |
1645 | if (r) |
1646 | goto free_ctx; |
1647 | |
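	/* All GPUs selected in ctx->bitmap must share the same page-map owner
	 * for the hmm_range_fault below; otherwise a NULL owner is used so
	 * device-private (VRAM) pages are not treated as valid and get
	 * migrated to system memory first.
	 */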
1648 | p = container_of(prange->svms, struct kfd_process, svms); |
1649 | owner = kfd_svm_page_owner(p, gpuidx: find_first_bit(addr: ctx->bitmap, |
1650 | MAX_GPU_INSTANCE)); |
1651 | for_each_set_bit(idx, ctx->bitmap, MAX_GPU_INSTANCE) { |
1652 | if (kfd_svm_page_owner(p, gpuidx: idx) != owner) { |
1653 | owner = NULL; |
1654 | break; |
1655 | } |
1656 | } |
1657 | |
1658 | start = map_start << PAGE_SHIFT; |
1659 | end = (map_last + 1) << PAGE_SHIFT; |
1660 | for (addr = start; !r && addr < end; ) { |
1661 | struct hmm_range *hmm_range; |
1662 | unsigned long map_start_vma; |
1663 | unsigned long map_last_vma; |
1664 | struct vm_area_struct *vma; |
1665 | unsigned long next = 0; |
1666 | unsigned long offset; |
1667 | unsigned long npages; |
1668 | bool readonly; |
1669 | |
1670 | vma = vma_lookup(mm, addr); |
1671 | if (vma) { |
1672 | readonly = !(vma->vm_flags & VM_WRITE); |
1673 | |
1674 | next = min(vma->vm_end, end); |
1675 | npages = (next - addr) >> PAGE_SHIFT; |
1676 | WRITE_ONCE(p->svms.faulting_task, current); |
1677 | r = amdgpu_hmm_range_get_pages(notifier: &prange->notifier, start: addr, npages, |
1678 | readonly, owner, NULL, |
1679 | phmm_range: &hmm_range); |
1680 | WRITE_ONCE(p->svms.faulting_task, NULL); |
1681 | if (r) { |
1682 | pr_debug("failed %d to get svm range pages\n" , r); |
1683 | if (r == -EBUSY) |
1684 | r = -EAGAIN; |
1685 | } |
1686 | } else { |
1687 | r = -EFAULT; |
1688 | } |
1689 | |
1690 | if (!r) { |
1691 | offset = (addr >> PAGE_SHIFT) - prange->start; |
1692 | r = svm_range_dma_map(prange, bitmap: ctx->bitmap, offset, npages, |
1693 | hmm_pfns: hmm_range->hmm_pfns); |
1694 | if (r) |
1695 | pr_debug("failed %d to dma map range\n" , r); |
1696 | } |
1697 | |
1698 | svm_range_lock(prange); |
1699 | if (!r && amdgpu_hmm_range_get_pages_done(hmm_range)) { |
1700 | pr_debug("hmm update the range, need validate again\n" ); |
1701 | r = -EAGAIN; |
1702 | } |
1703 | |
1704 | if (!r && !list_empty(head: &prange->child_list)) { |
1705 | pr_debug("range split by unmap in parallel, validate again\n" ); |
1706 | r = -EAGAIN; |
1707 | } |
1708 | |
1709 | if (!r) { |
1710 | map_start_vma = max(map_start, prange->start + offset); |
1711 | map_last_vma = min(map_last, prange->start + offset + npages - 1); |
1712 | if (map_start_vma <= map_last_vma) { |
1713 | offset = map_start_vma - prange->start; |
1714 | npages = map_last_vma - map_start_vma + 1; |
1715 | r = svm_range_map_to_gpus(prange, offset, npages, readonly, |
1716 | bitmap: ctx->bitmap, wait, flush_tlb); |
1717 | } |
1718 | } |
1719 | |
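		/* Only mark the whole prange as mapped once the final VMA chunk
		 * of the requested range has been handled without error.
		 */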
1720 | if (!r && next == end) |
1721 | prange->mapped_to_gpu = true; |
1722 | |
1723 | svm_range_unlock(prange); |
1724 | |
1725 | addr = next; |
1726 | } |
1727 | |
1728 | svm_range_unreserve_bos(ctx); |
1729 | if (!r) |
1730 | prange->validate_timestamp = ktime_get_boottime(); |
1731 | |
1732 | free_ctx: |
1733 | kfree(objp: ctx); |
1734 | |
1735 | return r; |
1736 | } |
1737 | |
1738 | /** |
1739 | * svm_range_list_lock_and_flush_work - flush pending deferred work |
1740 | * |
1741 | * @svms: the svm range list |
1742 | * @mm: the mm structure |
1743 | * |
1744 | * Context: Returns with mmap write lock held, pending deferred work flushed |
1745 | * |
1746 | */ |
1747 | void |
1748 | svm_range_list_lock_and_flush_work(struct svm_range_list *svms, |
1749 | struct mm_struct *mm) |
1750 | { |
1751 | retry_flush_work: |
1752 | flush_work(work: &svms->deferred_list_work); |
1753 | mmap_write_lock(mm); |
1754 | |
1755 | if (list_empty(head: &svms->deferred_range_list)) |
1756 | return; |
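	/* New deferred work was added between the flush and taking the mmap
	 * lock; drop the lock and flush again.
	 */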
1757 | mmap_write_unlock(mm); |
1758 | pr_debug("retry flush\n" ); |
1759 | goto retry_flush_work; |
1760 | } |
1761 | |
1762 | static void svm_range_restore_work(struct work_struct *work) |
1763 | { |
1764 | struct delayed_work *dwork = to_delayed_work(work); |
1765 | struct amdkfd_process_info *process_info; |
1766 | struct svm_range_list *svms; |
1767 | struct svm_range *prange; |
1768 | struct kfd_process *p; |
1769 | struct mm_struct *mm; |
1770 | int evicted_ranges; |
1771 | int invalid; |
1772 | int r; |
1773 | |
1774 | svms = container_of(dwork, struct svm_range_list, restore_work); |
1775 | evicted_ranges = atomic_read(v: &svms->evicted_ranges); |
1776 | if (!evicted_ranges) |
1777 | return; |
1778 | |
1779 | pr_debug("restore svm ranges\n" ); |
1780 | |
1781 | p = container_of(svms, struct kfd_process, svms); |
1782 | process_info = p->kgd_process_info; |
1783 | |
	/* Hold the mm reference while svm_range_validate_and_map restores the ranges */
1785 | mm = get_task_mm(task: p->lead_thread); |
1786 | if (!mm) { |
1787 | pr_debug("svms 0x%p process mm gone\n" , svms); |
1788 | return; |
1789 | } |
1790 | |
1791 | mutex_lock(&process_info->lock); |
1792 | svm_range_list_lock_and_flush_work(svms, mm); |
1793 | mutex_lock(&svms->lock); |
1794 | |
1795 | evicted_ranges = atomic_read(v: &svms->evicted_ranges); |
1796 | |
1797 | list_for_each_entry(prange, &svms->list, list) { |
1798 | invalid = atomic_read(v: &prange->invalid); |
1799 | if (!invalid) |
1800 | continue; |
1801 | |
1802 | pr_debug("restoring svms 0x%p prange 0x%p [0x%lx %lx] inv %d\n" , |
1803 | prange->svms, prange, prange->start, prange->last, |
1804 | invalid); |
1805 | |
		/*
		 * If the range is migrating, wait until the migration is done.
		 */
1809 | mutex_lock(&prange->migrate_mutex); |
1810 | |
1811 | r = svm_range_validate_and_map(mm, map_start: prange->start, map_last: prange->last, prange, |
1812 | MAX_GPU_INSTANCE, intr: false, wait: true, flush_tlb: false); |
1813 | if (r) |
1814 | pr_debug("failed %d to map 0x%lx to gpus\n" , r, |
1815 | prange->start); |
1816 | |
1817 | mutex_unlock(lock: &prange->migrate_mutex); |
1818 | if (r) |
1819 | goto out_reschedule; |
1820 | |
1821 | if (atomic_cmpxchg(v: &prange->invalid, old: invalid, new: 0) != invalid) |
1822 | goto out_reschedule; |
1823 | } |
1824 | |
1825 | if (atomic_cmpxchg(v: &svms->evicted_ranges, old: evicted_ranges, new: 0) != |
1826 | evicted_ranges) |
1827 | goto out_reschedule; |
1828 | |
1829 | evicted_ranges = 0; |
1830 | |
1831 | r = kgd2kfd_resume_mm(mm); |
1832 | if (r) { |
1833 | /* No recovery from this failure. Probably the CP is |
1834 | * hanging. No point trying again. |
1835 | */ |
1836 | pr_debug("failed %d to resume KFD\n" , r); |
1837 | } |
1838 | |
1839 | pr_debug("restore svm ranges successfully\n" ); |
1840 | |
1841 | out_reschedule: |
1842 | mutex_unlock(lock: &svms->lock); |
1843 | mmap_write_unlock(mm); |
1844 | mutex_unlock(lock: &process_info->lock); |
1845 | |
1846 | /* If validation failed, reschedule another attempt */ |
1847 | if (evicted_ranges) { |
1848 | pr_debug("reschedule to restore svm range\n" ); |
1849 | queue_delayed_work(wq: system_freezable_wq, dwork: &svms->restore_work, |
1850 | delay: msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS)); |
1851 | |
1852 | kfd_smi_event_queue_restore_rescheduled(mm); |
1853 | } |
1854 | mmput(mm); |
1855 | } |
1856 | |
1857 | /** |
1858 | * svm_range_evict - evict svm range |
1859 | * @prange: svm range structure |
1860 | * @mm: current process mm_struct |
 * @start: first page of the range being invalidated, in pages
 * @last: last page of the range being invalidated, in pages
 * @event: mmu notifier event when range is evicted or migrated
 *
 * Stop all queues of the process to ensure GPU doesn't access the memory, then
 * return to let the CPU evict the buffer and proceed with the CPU page table
 * update.
 *
 * No lock is needed to synchronize CPU page table invalidation with GPU
 * execution. If an invalidation happens while the restore work is running, the
 * restore work restarts to pick up the latest CPU page mappings before it
 * starts the queues again.
1872 | */ |
1873 | static int |
1874 | svm_range_evict(struct svm_range *prange, struct mm_struct *mm, |
1875 | unsigned long start, unsigned long last, |
1876 | enum mmu_notifier_event event) |
1877 | { |
1878 | struct svm_range_list *svms = prange->svms; |
1879 | struct svm_range *pchild; |
1880 | struct kfd_process *p; |
1881 | int r = 0; |
1882 | |
1883 | p = container_of(svms, struct kfd_process, svms); |
1884 | |
1885 | pr_debug("invalidate svms 0x%p prange [0x%lx 0x%lx] [0x%lx 0x%lx]\n" , |
1886 | svms, prange->start, prange->last, start, last); |
1887 | |
1888 | if (!p->xnack_enabled || |
1889 | (prange->flags & KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED)) { |
1890 | int evicted_ranges; |
1891 | bool mapped = prange->mapped_to_gpu; |
1892 | |
1893 | list_for_each_entry(pchild, &prange->child_list, child_list) { |
1894 | if (!pchild->mapped_to_gpu) |
1895 | continue; |
1896 | mapped = true; |
1897 | mutex_lock_nested(lock: &pchild->lock, subclass: 1); |
1898 | if (pchild->start <= last && pchild->last >= start) { |
1899 | pr_debug("increment pchild invalid [0x%lx 0x%lx]\n" , |
1900 | pchild->start, pchild->last); |
1901 | atomic_inc(v: &pchild->invalid); |
1902 | } |
1903 | mutex_unlock(lock: &pchild->lock); |
1904 | } |
1905 | |
1906 | if (!mapped) |
1907 | return r; |
1908 | |
1909 | if (prange->start <= last && prange->last >= start) |
1910 | atomic_inc(v: &prange->invalid); |
1911 | |
1912 | evicted_ranges = atomic_inc_return(v: &svms->evicted_ranges); |
1913 | if (evicted_ranges != 1) |
1914 | return r; |
1915 | |
1916 | pr_debug("evicting svms 0x%p range [0x%lx 0x%lx]\n" , |
1917 | prange->svms, prange->start, prange->last); |
1918 | |
1919 | /* First eviction, stop the queues */ |
1920 | r = kgd2kfd_quiesce_mm(mm, trigger: KFD_QUEUE_EVICTION_TRIGGER_SVM); |
1921 | if (r) |
1922 | pr_debug("failed to quiesce KFD\n" ); |
1923 | |
1924 | pr_debug("schedule to restore svm %p ranges\n" , svms); |
1925 | queue_delayed_work(wq: system_freezable_wq, dwork: &svms->restore_work, |
1926 | delay: msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS)); |
1927 | } else { |
1928 | unsigned long s, l; |
1929 | uint32_t trigger; |
1930 | |
1931 | if (event == MMU_NOTIFY_MIGRATE) |
1932 | trigger = KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY_MIGRATE; |
1933 | else |
1934 | trigger = KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY; |
1935 | |
1936 | pr_debug("invalidate unmap svms 0x%p [0x%lx 0x%lx] from GPUs\n" , |
1937 | prange->svms, start, last); |
1938 | list_for_each_entry(pchild, &prange->child_list, child_list) { |
1939 | mutex_lock_nested(lock: &pchild->lock, subclass: 1); |
1940 | s = max(start, pchild->start); |
1941 | l = min(last, pchild->last); |
1942 | if (l >= s) |
1943 | svm_range_unmap_from_gpus(prange: pchild, start: s, last: l, trigger); |
1944 | mutex_unlock(lock: &pchild->lock); |
1945 | } |
1946 | s = max(start, prange->start); |
1947 | l = min(last, prange->last); |
1948 | if (l >= s) |
1949 | svm_range_unmap_from_gpus(prange, start: s, last: l, trigger); |
1950 | } |
1951 | |
1952 | return r; |
1953 | } |
1954 | |
1955 | static struct svm_range *svm_range_clone(struct svm_range *old) |
1956 | { |
1957 | struct svm_range *new; |
1958 | |
1959 | new = svm_range_new(svms: old->svms, start: old->start, last: old->last, update_mem_usage: false); |
1960 | if (!new) |
1961 | return NULL; |
1962 | if (svm_range_copy_dma_addrs(dst: new, src: old)) { |
1963 | svm_range_free(prange: new, do_unmap: false); |
1964 | return NULL; |
1965 | } |
1966 | if (old->svm_bo) { |
1967 | new->ttm_res = old->ttm_res; |
1968 | new->offset = old->offset; |
1969 | new->svm_bo = svm_range_bo_ref(svm_bo: old->svm_bo); |
1970 | spin_lock(lock: &new->svm_bo->list_lock); |
1971 | list_add(new: &new->svm_bo_list, head: &new->svm_bo->range_list); |
1972 | spin_unlock(lock: &new->svm_bo->list_lock); |
1973 | } |
1974 | new->flags = old->flags; |
1975 | new->preferred_loc = old->preferred_loc; |
1976 | new->prefetch_loc = old->prefetch_loc; |
1977 | new->actual_loc = old->actual_loc; |
1978 | new->granularity = old->granularity; |
1979 | new->mapped_to_gpu = old->mapped_to_gpu; |
1980 | new->vram_pages = old->vram_pages; |
1981 | bitmap_copy(dst: new->bitmap_access, src: old->bitmap_access, MAX_GPU_INSTANCE); |
1982 | bitmap_copy(dst: new->bitmap_aip, src: old->bitmap_aip, MAX_GPU_INSTANCE); |
1983 | |
1984 | return new; |
1985 | } |
1986 | |
1987 | void svm_range_set_max_pages(struct amdgpu_device *adev) |
1988 | { |
1989 | uint64_t max_pages; |
1990 | uint64_t pages, _pages; |
1991 | uint64_t min_pages = 0; |
1992 | int i, id; |
1993 | |
1994 | for (i = 0; i < adev->kfd.dev->num_nodes; i++) { |
1995 | if (adev->kfd.dev->nodes[i]->xcp) |
1996 | id = adev->kfd.dev->nodes[i]->xcp->id; |
1997 | else |
1998 | id = -1; |
1999 | pages = KFD_XCP_MEMORY_SIZE(adev, id) >> 17; |
2000 | pages = clamp(pages, 1ULL << 9, 1ULL << 18); |
2001 | pages = rounddown_pow_of_two(pages); |
2002 | min_pages = min_not_zero(min_pages, pages); |
2003 | } |
2004 | |
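	/* Lock-free update: only ever shrink the global limit, keeping the
	 * minimum of the per-GPU values computed above.
	 */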
2005 | do { |
2006 | max_pages = READ_ONCE(max_svm_range_pages); |
2007 | _pages = min_not_zero(max_pages, min_pages); |
2008 | } while (cmpxchg(&max_svm_range_pages, max_pages, _pages) != max_pages); |
2009 | } |
2010 | |
2011 | static int |
2012 | svm_range_split_new(struct svm_range_list *svms, uint64_t start, uint64_t last, |
2013 | uint64_t max_pages, struct list_head *insert_list, |
2014 | struct list_head *update_list) |
2015 | { |
2016 | struct svm_range *prange; |
2017 | uint64_t l; |
2018 | |
2019 | pr_debug("max_svm_range_pages 0x%llx adding [0x%llx 0x%llx]\n" , |
2020 | max_pages, start, last); |
2021 | |
2022 | while (last >= start) { |
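		/* End each piece at the next max_pages-aligned boundary so every
		 * following range starts aligned and spans at most max_pages.
		 */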
2023 | l = min(last, ALIGN_DOWN(start + max_pages, max_pages) - 1); |
2024 | |
2025 | prange = svm_range_new(svms, start, last: l, update_mem_usage: true); |
2026 | if (!prange) |
2027 | return -ENOMEM; |
2028 | list_add(new: &prange->list, head: insert_list); |
2029 | list_add(new: &prange->update_list, head: update_list); |
2030 | |
2031 | start = l + 1; |
2032 | } |
2033 | return 0; |
2034 | } |
2035 | |
2036 | /** |
2037 | * svm_range_add - add svm range and handle overlap |
 * @p: the process to add the range to
 * @start: start of the range, in pages
 * @size: size of the range, in pages
 * @nattr: number of attributes
 * @attrs: array of attributes
 * @update_list: output, the ranges that need validation and GPU mapping update
 * @insert_list: output, the ranges that need to be inserted into svms
 * @remove_list: output, the ranges that are replaced and need removal from svms
 * @remap_list: output, the unaligned svm ranges that need to be remapped
2047 | * |
2048 | * Check if the virtual address range has overlap with any existing ranges, |
2049 | * split partly overlapping ranges and add new ranges in the gaps. All changes |
2050 | * should be applied to the range_list and interval tree transactionally. If |
2051 | * any range split or allocation fails, the entire update fails. Therefore any |
2052 | * existing overlapping svm_ranges are cloned and the original svm_ranges left |
2053 | * unchanged. |
2054 | * |
2055 | * If the transaction succeeds, the caller can update and insert clones and |
2056 | * new ranges, then free the originals. |
2057 | * |
2058 | * Otherwise the caller can free the clones and new ranges, while the old |
2059 | * svm_ranges remain unchanged. |
2060 | * |
2061 | * Context: Process context, caller must hold svms->lock |
2062 | * |
2063 | * Return: |
2064 | * 0 - OK, otherwise error code |
2065 | */ |
2066 | static int |
2067 | svm_range_add(struct kfd_process *p, uint64_t start, uint64_t size, |
2068 | uint32_t nattr, struct kfd_ioctl_svm_attribute *attrs, |
2069 | struct list_head *update_list, struct list_head *insert_list, |
2070 | struct list_head *remove_list, struct list_head *remap_list) |
2071 | { |
2072 | unsigned long last = start + size - 1UL; |
2073 | struct svm_range_list *svms = &p->svms; |
2074 | struct interval_tree_node *node; |
2075 | struct svm_range *prange; |
2076 | struct svm_range *tmp; |
2077 | struct list_head new_list; |
2078 | int r = 0; |
2079 | |
2080 | pr_debug("svms 0x%p [0x%llx 0x%lx]\n" , &p->svms, start, last); |
2081 | |
2082 | INIT_LIST_HEAD(list: update_list); |
2083 | INIT_LIST_HEAD(list: insert_list); |
2084 | INIT_LIST_HEAD(list: remove_list); |
2085 | INIT_LIST_HEAD(list: &new_list); |
2086 | INIT_LIST_HEAD(list: remap_list); |
2087 | |
2088 | node = interval_tree_iter_first(root: &svms->objects, start, last); |
2089 | while (node) { |
2090 | struct interval_tree_node *next; |
2091 | unsigned long next_start; |
2092 | |
2093 | pr_debug("found overlap node [0x%lx 0x%lx]\n" , node->start, |
2094 | node->last); |
2095 | |
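		/* Grab the next node and the continuation point now, before this
		 * node is possibly cloned, split or replaced below.
		 */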
2096 | prange = container_of(node, struct svm_range, it_node); |
2097 | next = interval_tree_iter_next(node, start, last); |
2098 | next_start = min(node->last, last) + 1; |
2099 | |
2100 | if (svm_range_is_same_attrs(p, prange, nattr, attrs) && |
2101 | prange->mapped_to_gpu) { |
2102 | /* nothing to do */ |
2103 | } else if (node->start < start || node->last > last) { |
2104 | /* node intersects the update range and its attributes |
2105 | * will change. Clone and split it, apply updates only |
2106 | * to the overlapping part |
2107 | */ |
2108 | struct svm_range *old = prange; |
2109 | |
2110 | prange = svm_range_clone(old); |
2111 | if (!prange) { |
2112 | r = -ENOMEM; |
2113 | goto out; |
2114 | } |
2115 | |
2116 | list_add(new: &old->update_list, head: remove_list); |
2117 | list_add(new: &prange->list, head: insert_list); |
2118 | list_add(new: &prange->update_list, head: update_list); |
2119 | |
2120 | if (node->start < start) { |
2121 | pr_debug("change old range start\n" ); |
2122 | r = svm_range_split_head(prange, new_start: start, |
2123 | insert_list, remap_list); |
2124 | if (r) |
2125 | goto out; |
2126 | } |
2127 | if (node->last > last) { |
2128 | pr_debug("change old range last\n" ); |
2129 | r = svm_range_split_tail(prange, new_last: last, |
2130 | insert_list, remap_list); |
2131 | if (r) |
2132 | goto out; |
2133 | } |
2134 | } else { |
2135 | /* The node is contained within start..last, |
2136 | * just update it |
2137 | */ |
2138 | list_add(new: &prange->update_list, head: update_list); |
2139 | } |
2140 | |
2141 | /* insert a new node if needed */ |
2142 | if (node->start > start) { |
2143 | r = svm_range_split_new(svms, start, last: node->start - 1, |
2144 | READ_ONCE(max_svm_range_pages), |
2145 | insert_list: &new_list, update_list); |
2146 | if (r) |
2147 | goto out; |
2148 | } |
2149 | |
2150 | node = next; |
2151 | start = next_start; |
2152 | } |
2153 | |
2154 | /* add a final range at the end if needed */ |
2155 | if (start <= last) |
2156 | r = svm_range_split_new(svms, start, last, |
2157 | READ_ONCE(max_svm_range_pages), |
2158 | insert_list: &new_list, update_list); |
2159 | |
2160 | out: |
2161 | if (r) { |
2162 | list_for_each_entry_safe(prange, tmp, insert_list, list) |
2163 | svm_range_free(prange, do_unmap: false); |
2164 | list_for_each_entry_safe(prange, tmp, &new_list, list) |
2165 | svm_range_free(prange, do_unmap: true); |
2166 | } else { |
2167 | list_splice(list: &new_list, head: insert_list); |
2168 | } |
2169 | |
2170 | return r; |
2171 | } |
2172 | |
2173 | static void |
2174 | svm_range_update_notifier_and_interval_tree(struct mm_struct *mm, |
2175 | struct svm_range *prange) |
2176 | { |
2177 | unsigned long start; |
2178 | unsigned long last; |
2179 | |
2180 | start = prange->notifier.interval_tree.start >> PAGE_SHIFT; |
2181 | last = prange->notifier.interval_tree.last >> PAGE_SHIFT; |
2182 | |
2183 | if (prange->start == start && prange->last == last) |
2184 | return; |
2185 | |
2186 | pr_debug("up notifier 0x%p prange 0x%p [0x%lx 0x%lx] [0x%lx 0x%lx]\n" , |
2187 | prange->svms, prange, start, last, prange->start, |
2188 | prange->last); |
2189 | |
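	/* A notifier interval of [0 0] means the prange (e.g. a new child from
	 * a split) was never inserted into the interval tree and has no
	 * notifier registered, so there is nothing to remove first.
	 */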
2190 | if (start != 0 && last != 0) { |
2191 | interval_tree_remove(node: &prange->it_node, root: &prange->svms->objects); |
2192 | svm_range_remove_notifier(prange); |
2193 | } |
2194 | prange->it_node.start = prange->start; |
2195 | prange->it_node.last = prange->last; |
2196 | |
2197 | interval_tree_insert(node: &prange->it_node, root: &prange->svms->objects); |
2198 | svm_range_add_notifier_locked(mm, prange); |
2199 | } |
2200 | |
2201 | static void |
2202 | svm_range_handle_list_op(struct svm_range_list *svms, struct svm_range *prange, |
2203 | struct mm_struct *mm) |
2204 | { |
2205 | switch (prange->work_item.op) { |
2206 | case SVM_OP_NULL: |
2207 | pr_debug("NULL OP 0x%p prange 0x%p [0x%lx 0x%lx]\n" , |
2208 | svms, prange, prange->start, prange->last); |
2209 | break; |
2210 | case SVM_OP_UNMAP_RANGE: |
2211 | pr_debug("remove 0x%p prange 0x%p [0x%lx 0x%lx]\n" , |
2212 | svms, prange, prange->start, prange->last); |
2213 | svm_range_unlink(prange); |
2214 | svm_range_remove_notifier(prange); |
2215 | svm_range_free(prange, do_unmap: true); |
2216 | break; |
2217 | case SVM_OP_UPDATE_RANGE_NOTIFIER: |
2218 | pr_debug("update notifier 0x%p prange 0x%p [0x%lx 0x%lx]\n" , |
2219 | svms, prange, prange->start, prange->last); |
2220 | svm_range_update_notifier_and_interval_tree(mm, prange); |
2221 | break; |
2222 | case SVM_OP_UPDATE_RANGE_NOTIFIER_AND_MAP: |
2223 | pr_debug("update and map 0x%p prange 0x%p [0x%lx 0x%lx]\n" , |
2224 | svms, prange, prange->start, prange->last); |
2225 | svm_range_update_notifier_and_interval_tree(mm, prange); |
2226 | /* TODO: implement deferred validation and mapping */ |
2227 | break; |
2228 | case SVM_OP_ADD_RANGE: |
2229 | pr_debug("add 0x%p prange 0x%p [0x%lx 0x%lx]\n" , svms, prange, |
2230 | prange->start, prange->last); |
2231 | svm_range_add_to_svms(prange); |
2232 | svm_range_add_notifier_locked(mm, prange); |
2233 | break; |
2234 | case SVM_OP_ADD_RANGE_AND_MAP: |
2235 | pr_debug("add and map 0x%p prange 0x%p [0x%lx 0x%lx]\n" , svms, |
2236 | prange, prange->start, prange->last); |
2237 | svm_range_add_to_svms(prange); |
2238 | svm_range_add_notifier_locked(mm, prange); |
2239 | /* TODO: implement deferred validation and mapping */ |
2240 | break; |
2241 | default: |
2242 | WARN_ONCE(1, "Unknown prange 0x%p work op %d\n" , prange, |
2243 | prange->work_item.op); |
2244 | } |
2245 | } |
2246 | |
2247 | static void svm_range_drain_retry_fault(struct svm_range_list *svms) |
2248 | { |
2249 | struct kfd_process_device *pdd; |
2250 | struct kfd_process *p; |
2251 | int drain; |
2252 | uint32_t i; |
2253 | |
2254 | p = container_of(svms, struct kfd_process, svms); |
2255 | |
2256 | restart: |
2257 | drain = atomic_read(v: &svms->drain_pagefaults); |
2258 | if (!drain) |
2259 | return; |
2260 | |
2261 | for_each_set_bit(i, svms->bitmap_supported, p->n_pdds) { |
2262 | pdd = p->pdds[i]; |
2263 | if (!pdd) |
2264 | continue; |
2265 | |
2266 | pr_debug("drain retry fault gpu %d svms %p\n" , i, svms); |
2267 | |
2268 | amdgpu_ih_wait_on_checkpoint_process_ts(adev: pdd->dev->adev, |
2269 | ih: pdd->dev->adev->irq.retry_cam_enabled ? |
2270 | &pdd->dev->adev->irq.ih : |
2271 | &pdd->dev->adev->irq.ih1); |
2272 | |
2273 | if (pdd->dev->adev->irq.retry_cam_enabled) |
2274 | amdgpu_ih_wait_on_checkpoint_process_ts(adev: pdd->dev->adev, |
2275 | ih: &pdd->dev->adev->irq.ih_soft); |
2276 | |
2277 | |
2278 | pr_debug("drain retry fault gpu %d svms 0x%p done\n" , i, svms); |
2279 | } |
2280 | if (atomic_cmpxchg(v: &svms->drain_pagefaults, old: drain, new: 0) != drain) |
2281 | goto restart; |
2282 | } |
2283 | |
2284 | static void svm_range_deferred_list_work(struct work_struct *work) |
2285 | { |
2286 | struct svm_range_list *svms; |
2287 | struct svm_range *prange; |
2288 | struct mm_struct *mm; |
2289 | |
2290 | svms = container_of(work, struct svm_range_list, deferred_list_work); |
2291 | pr_debug("enter svms 0x%p\n" , svms); |
2292 | |
2293 | spin_lock(lock: &svms->deferred_list_lock); |
2294 | while (!list_empty(head: &svms->deferred_range_list)) { |
2295 | prange = list_first_entry(&svms->deferred_range_list, |
2296 | struct svm_range, deferred_list); |
2297 | spin_unlock(lock: &svms->deferred_list_lock); |
2298 | |
2299 | pr_debug("prange 0x%p [0x%lx 0x%lx] op %d\n" , prange, |
2300 | prange->start, prange->last, prange->work_item.op); |
2301 | |
2302 | mm = prange->work_item.mm; |
2303 | retry: |
2304 | mmap_write_lock(mm); |
2305 | |
2306 | /* Checking for the need to drain retry faults must be inside |
2307 | * mmap write lock to serialize with munmap notifiers. |
2308 | */ |
2309 | if (unlikely(atomic_read(&svms->drain_pagefaults))) { |
2310 | mmap_write_unlock(mm); |
2311 | svm_range_drain_retry_fault(svms); |
2312 | goto retry; |
2313 | } |
2314 | |
2315 | /* Remove from deferred_list must be inside mmap write lock, for |
2316 | * two race cases: |
2317 | * 1. unmap_from_cpu may change work_item.op and add the range |
		 *    to deferred_list again, causing a use-after-free bug.
2319 | * 2. svm_range_list_lock_and_flush_work may hold mmap write |
2320 | * lock and continue because deferred_list is empty, but |
2321 | * deferred_list work is actually waiting for mmap lock. |
2322 | */ |
2323 | spin_lock(lock: &svms->deferred_list_lock); |
2324 | list_del_init(entry: &prange->deferred_list); |
2325 | spin_unlock(lock: &svms->deferred_list_lock); |
2326 | |
2327 | mutex_lock(&svms->lock); |
2328 | mutex_lock(&prange->migrate_mutex); |
2329 | while (!list_empty(head: &prange->child_list)) { |
2330 | struct svm_range *pchild; |
2331 | |
2332 | pchild = list_first_entry(&prange->child_list, |
2333 | struct svm_range, child_list); |
2334 | pr_debug("child prange 0x%p op %d\n" , pchild, |
2335 | pchild->work_item.op); |
2336 | list_del_init(entry: &pchild->child_list); |
2337 | svm_range_handle_list_op(svms, prange: pchild, mm); |
2338 | } |
2339 | mutex_unlock(lock: &prange->migrate_mutex); |
2340 | |
2341 | svm_range_handle_list_op(svms, prange, mm); |
2342 | mutex_unlock(lock: &svms->lock); |
2343 | mmap_write_unlock(mm); |
2344 | |
2345 | /* Pairs with mmget in svm_range_add_list_work. If dropping the |
2346 | * last mm refcount, schedule release work to avoid circular locking |
2347 | */ |
2348 | mmput_async(mm); |
2349 | |
2350 | spin_lock(lock: &svms->deferred_list_lock); |
2351 | } |
2352 | spin_unlock(lock: &svms->deferred_list_lock); |
2353 | pr_debug("exit svms 0x%p\n" , svms); |
2354 | } |
2355 | |
2356 | void |
2357 | svm_range_add_list_work(struct svm_range_list *svms, struct svm_range *prange, |
2358 | struct mm_struct *mm, enum svm_work_list_ops op) |
2359 | { |
2360 | spin_lock(lock: &svms->deferred_list_lock); |
2361 | /* if prange is on the deferred list */ |
2362 | if (!list_empty(head: &prange->deferred_list)) { |
2363 | pr_debug("update exist prange 0x%p work op %d\n" , prange, op); |
2364 | WARN_ONCE(prange->work_item.mm != mm, "unmatch mm\n" ); |
2365 | if (op != SVM_OP_NULL && |
2366 | prange->work_item.op != SVM_OP_UNMAP_RANGE) |
2367 | prange->work_item.op = op; |
2368 | } else { |
2369 | prange->work_item.op = op; |
2370 | |
2371 | /* Pairs with mmput in deferred_list_work */ |
2372 | mmget(mm); |
2373 | prange->work_item.mm = mm; |
2374 | list_add_tail(new: &prange->deferred_list, |
2375 | head: &prange->svms->deferred_range_list); |
2376 | pr_debug("add prange 0x%p [0x%lx 0x%lx] to work list op %d\n" , |
2377 | prange, prange->start, prange->last, op); |
2378 | } |
2379 | spin_unlock(lock: &svms->deferred_list_lock); |
2380 | } |
2381 | |
2382 | void schedule_deferred_list_work(struct svm_range_list *svms) |
2383 | { |
2384 | spin_lock(lock: &svms->deferred_list_lock); |
2385 | if (!list_empty(head: &svms->deferred_range_list)) |
2386 | schedule_work(work: &svms->deferred_list_work); |
2387 | spin_unlock(lock: &svms->deferred_list_lock); |
2388 | } |
2389 | |
2390 | static void |
2391 | svm_range_unmap_split(struct mm_struct *mm, struct svm_range *parent, |
2392 | struct svm_range *prange, unsigned long start, |
2393 | unsigned long last) |
2394 | { |
2395 | struct svm_range *head; |
2396 | struct svm_range *tail; |
2397 | |
2398 | if (prange->work_item.op == SVM_OP_UNMAP_RANGE) { |
2399 | pr_debug("prange 0x%p [0x%lx 0x%lx] is already freed\n" , prange, |
2400 | prange->start, prange->last); |
2401 | return; |
2402 | } |
2403 | if (start > prange->last || last < prange->start) |
2404 | return; |
2405 | |
2406 | head = tail = prange; |
2407 | if (start > prange->start) |
2408 | svm_range_split(prange, start: prange->start, last: start - 1, new: &tail); |
2409 | if (last < tail->last) |
2410 | svm_range_split(prange: tail, start: last + 1, last: tail->last, new: &head); |
2411 | |
2412 | if (head != prange && tail != prange) { |
2413 | svm_range_add_child(prange: parent, mm, pchild: head, op: SVM_OP_UNMAP_RANGE); |
2414 | svm_range_add_child(prange: parent, mm, pchild: tail, op: SVM_OP_ADD_RANGE); |
2415 | } else if (tail != prange) { |
2416 | svm_range_add_child(prange: parent, mm, pchild: tail, op: SVM_OP_UNMAP_RANGE); |
2417 | } else if (head != prange) { |
2418 | svm_range_add_child(prange: parent, mm, pchild: head, op: SVM_OP_UNMAP_RANGE); |
2419 | } else if (parent != prange) { |
2420 | prange->work_item.op = SVM_OP_UNMAP_RANGE; |
2421 | } |
2422 | } |
2423 | |
2424 | static void |
2425 | svm_range_unmap_from_cpu(struct mm_struct *mm, struct svm_range *prange, |
2426 | unsigned long start, unsigned long last) |
2427 | { |
2428 | uint32_t trigger = KFD_SVM_UNMAP_TRIGGER_UNMAP_FROM_CPU; |
2429 | struct svm_range_list *svms; |
2430 | struct svm_range *pchild; |
2431 | struct kfd_process *p; |
2432 | unsigned long s, l; |
2433 | bool unmap_parent; |
2434 | |
2435 | p = kfd_lookup_process_by_mm(mm); |
2436 | if (!p) |
2437 | return; |
2438 | svms = &p->svms; |
2439 | |
2440 | pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] [0x%lx 0x%lx]\n" , svms, |
2441 | prange, prange->start, prange->last, start, last); |
2442 | |
2443 | /* Make sure pending page faults are drained in the deferred worker |
2444 | * before the range is freed to avoid straggler interrupts on |
2445 | * unmapped memory causing "phantom faults". |
2446 | */ |
2447 | atomic_inc(v: &svms->drain_pagefaults); |
2448 | |
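	/* The parent prange is removed only if the unmapped region covers it
	 * completely; otherwise only its notifier and interval are updated
	 * after the splits below.
	 */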
2449 | unmap_parent = start <= prange->start && last >= prange->last; |
2450 | |
2451 | list_for_each_entry(pchild, &prange->child_list, child_list) { |
2452 | mutex_lock_nested(lock: &pchild->lock, subclass: 1); |
2453 | s = max(start, pchild->start); |
2454 | l = min(last, pchild->last); |
2455 | if (l >= s) |
2456 | svm_range_unmap_from_gpus(prange: pchild, start: s, last: l, trigger); |
2457 | svm_range_unmap_split(mm, parent: prange, prange: pchild, start, last); |
2458 | mutex_unlock(lock: &pchild->lock); |
2459 | } |
2460 | s = max(start, prange->start); |
2461 | l = min(last, prange->last); |
2462 | if (l >= s) |
2463 | svm_range_unmap_from_gpus(prange, start: s, last: l, trigger); |
2464 | svm_range_unmap_split(mm, parent: prange, prange, start, last); |
2465 | |
2466 | if (unmap_parent) |
2467 | svm_range_add_list_work(svms, prange, mm, op: SVM_OP_UNMAP_RANGE); |
2468 | else |
2469 | svm_range_add_list_work(svms, prange, mm, |
2470 | op: SVM_OP_UPDATE_RANGE_NOTIFIER); |
2471 | schedule_deferred_list_work(svms); |
2472 | |
2473 | kfd_unref_process(p); |
2474 | } |
2475 | |
2476 | /** |
2477 | * svm_range_cpu_invalidate_pagetables - interval notifier callback |
2478 | * @mni: mmu_interval_notifier struct |
2479 | * @range: mmu_notifier_range struct |
2480 | * @cur_seq: value to pass to mmu_interval_set_seq() |
2481 | * |
 * If the event is MMU_NOTIFY_UNMAP, this callback is for a CPU unmap of the
 * range; otherwise it comes from migration or a CPU page invalidation callback.
2484 | * |
2485 | * For unmap event, unmap range from GPUs, remove prange from svms in a delayed |
2486 | * work thread, and split prange if only part of prange is unmapped. |
2487 | * |
2488 | * For invalidation event, if GPU retry fault is not enabled, evict the queues, |
2489 | * then schedule svm_range_restore_work to update GPU mapping and resume queues. |
2490 | * If GPU retry fault is enabled, unmap the svm range from GPU, retry fault will |
2491 | * update GPU mapping to recover. |
2492 | * |
2493 | * Context: mmap lock, notifier_invalidate_start lock are held |
2494 | * for invalidate event, prange lock is held if this is from migration |
2495 | */ |
2496 | static bool |
2497 | svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni, |
2498 | const struct mmu_notifier_range *range, |
2499 | unsigned long cur_seq) |
2500 | { |
2501 | struct svm_range *prange; |
2502 | unsigned long start; |
2503 | unsigned long last; |
2504 | |
2505 | if (range->event == MMU_NOTIFY_RELEASE) |
2506 | return true; |
2507 | if (!mmget_not_zero(mm: mni->mm)) |
2508 | return true; |
2509 | |
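	/* Work on the intersection of the notifier interval and the
	 * invalidated range, converted to page numbers.
	 */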
2510 | start = mni->interval_tree.start; |
2511 | last = mni->interval_tree.last; |
2512 | start = max(start, range->start) >> PAGE_SHIFT; |
2513 | last = min(last, range->end - 1) >> PAGE_SHIFT; |
2514 | pr_debug("[0x%lx 0x%lx] range[0x%lx 0x%lx] notifier[0x%lx 0x%lx] %d\n" , |
2515 | start, last, range->start >> PAGE_SHIFT, |
2516 | (range->end - 1) >> PAGE_SHIFT, |
2517 | mni->interval_tree.start >> PAGE_SHIFT, |
2518 | mni->interval_tree.last >> PAGE_SHIFT, range->event); |
2519 | |
2520 | prange = container_of(mni, struct svm_range, notifier); |
2521 | |
2522 | svm_range_lock(prange); |
2523 | mmu_interval_set_seq(interval_sub: mni, cur_seq); |
2524 | |
2525 | switch (range->event) { |
2526 | case MMU_NOTIFY_UNMAP: |
2527 | svm_range_unmap_from_cpu(mm: mni->mm, prange, start, last); |
2528 | break; |
2529 | default: |
2530 | svm_range_evict(prange, mm: mni->mm, start, last, event: range->event); |
2531 | break; |
2532 | } |
2533 | |
2534 | svm_range_unlock(prange); |
2535 | mmput(mni->mm); |
2536 | |
2537 | return true; |
2538 | } |
2539 | |
2540 | /** |
2541 | * svm_range_from_addr - find svm range from fault address |
2542 | * @svms: svm range list header |
2543 | * @addr: address to search range interval tree, in pages |
2544 | * @parent: parent range if range is on child list |
2545 | * |
2546 | * Context: The caller must hold svms->lock |
2547 | * |
2548 | * Return: the svm_range found or NULL |
2549 | */ |
2550 | struct svm_range * |
2551 | svm_range_from_addr(struct svm_range_list *svms, unsigned long addr, |
2552 | struct svm_range **parent) |
2553 | { |
2554 | struct interval_tree_node *node; |
2555 | struct svm_range *prange; |
2556 | struct svm_range *pchild; |
2557 | |
2558 | node = interval_tree_iter_first(root: &svms->objects, start: addr, last: addr); |
2559 | if (!node) |
2560 | return NULL; |
2561 | |
2562 | prange = container_of(node, struct svm_range, it_node); |
2563 | pr_debug("address 0x%lx prange [0x%lx 0x%lx] node [0x%lx 0x%lx]\n" , |
2564 | addr, prange->start, prange->last, node->start, node->last); |
2565 | |
2566 | if (addr >= prange->start && addr <= prange->last) { |
2567 | if (parent) |
2568 | *parent = prange; |
2569 | return prange; |
2570 | } |
2571 | list_for_each_entry(pchild, &prange->child_list, child_list) |
2572 | if (addr >= pchild->start && addr <= pchild->last) { |
2573 | pr_debug("found address 0x%lx pchild [0x%lx 0x%lx]\n" , |
2574 | addr, pchild->start, pchild->last); |
2575 | if (parent) |
2576 | *parent = prange; |
2577 | return pchild; |
2578 | } |
2579 | |
2580 | return NULL; |
2581 | } |
2582 | |
/* svm_range_best_restore_location - decide the best fault restore location
 * @prange: svm range structure
 * @node: the GPU node on which the vm fault happened
 * @gpuidx: output, the gpu index corresponding to the faulting node
 *
 * This is only called when xnack is on, to decide the best location to restore
 * the range mapping after a GPU vm fault. The caller uses the best location to
 * migrate the range if the actual location differs, then updates the GPU page
 * table mapping to the best location.
 *
 * If the preferred loc is accessible by the faulting GPU, use the preferred loc.
 * If the faulting GPU is in the range's ACCESSIBLE bitmap, best_loc is that GPU.
 * If the faulting GPU is in the range's ACCESSIBLE_IN_PLACE bitmap, then
 *   if the range's actual loc is the CPU, best_loc is the CPU;
 *   if the faulting GPU is in the same XGMI hive as the actual loc GPU,
 *   best_loc is the actual loc.
 * Otherwise the faulting GPU has no access and best_loc is -1.
 *
 * Return:
 * -1 if the faulting GPU has no access
 * 0 for the CPU, or a GPU id
 */
2604 | static int32_t |
2605 | svm_range_best_restore_location(struct svm_range *prange, |
2606 | struct kfd_node *node, |
2607 | int32_t *gpuidx) |
2608 | { |
2609 | struct kfd_node *bo_node, *preferred_node; |
2610 | struct kfd_process *p; |
2611 | uint32_t gpuid; |
2612 | int r; |
2613 | |
2614 | p = container_of(prange->svms, struct kfd_process, svms); |
2615 | |
2616 | r = kfd_process_gpuid_from_node(p, node, gpuid: &gpuid, gpuidx); |
2617 | if (r < 0) { |
2618 | pr_debug("failed to get gpuid from kgd\n" ); |
2619 | return -1; |
2620 | } |
2621 | |
2622 | if (node->adev->gmc.is_app_apu) |
2623 | return 0; |
2624 | |
2625 | if (prange->preferred_loc == gpuid || |
2626 | prange->preferred_loc == KFD_IOCTL_SVM_LOCATION_SYSMEM) { |
2627 | return prange->preferred_loc; |
2628 | } else if (prange->preferred_loc != KFD_IOCTL_SVM_LOCATION_UNDEFINED) { |
2629 | preferred_node = svm_range_get_node_by_id(prange, gpu_id: prange->preferred_loc); |
2630 | if (preferred_node && svm_nodes_in_same_hive(node_a: node, node_b: preferred_node)) |
2631 | return prange->preferred_loc; |
2632 | /* fall through */ |
2633 | } |
2634 | |
2635 | if (test_bit(*gpuidx, prange->bitmap_access)) |
2636 | return gpuid; |
2637 | |
2638 | if (test_bit(*gpuidx, prange->bitmap_aip)) { |
2639 | if (!prange->actual_loc) |
2640 | return 0; |
2641 | |
2642 | bo_node = svm_range_get_node_by_id(prange, gpu_id: prange->actual_loc); |
2643 | if (bo_node && svm_nodes_in_same_hive(node_a: node, node_b: bo_node)) |
2644 | return prange->actual_loc; |
2645 | else |
2646 | return 0; |
2647 | } |
2648 | |
2649 | return -1; |
2650 | } |
2651 | |
2652 | static int |
2653 | svm_range_get_range_boundaries(struct kfd_process *p, int64_t addr, |
2654 | unsigned long *start, unsigned long *last, |
2655 | bool *is_heap_stack) |
2656 | { |
2657 | struct vm_area_struct *vma; |
2658 | struct interval_tree_node *node; |
2659 | struct rb_node *rb_node; |
2660 | unsigned long start_limit, end_limit; |
2661 | |
2662 | vma = vma_lookup(mm: p->mm, addr: addr << PAGE_SHIFT); |
2663 | if (!vma) { |
2664 | pr_debug("VMA does not exist in address [0x%llx]\n" , addr); |
2665 | return -EFAULT; |
2666 | } |
2667 | |
2668 | *is_heap_stack = vma_is_initial_heap(vma) || vma_is_initial_stack(vma); |
2669 | |
2670 | start_limit = max(vma->vm_start >> PAGE_SHIFT, |
2671 | (unsigned long)ALIGN_DOWN(addr, 2UL << 8)); |
2672 | end_limit = min(vma->vm_end >> PAGE_SHIFT, |
2673 | (unsigned long)ALIGN(addr + 1, 2UL << 8)); |
2674 | /* First range that starts after the fault address */ |
2675 | node = interval_tree_iter_first(root: &p->svms.objects, start: addr + 1, ULONG_MAX); |
2676 | if (node) { |
2677 | end_limit = min(end_limit, node->start); |
2678 | /* Last range that ends before the fault address */ |
2679 | rb_node = rb_prev(&node->rb); |
2680 | } else { |
2681 | /* Last range must end before addr because |
2682 | * there was no range after addr |
2683 | */ |
2684 | rb_node = rb_last(&p->svms.objects.rb_root); |
2685 | } |
2686 | if (rb_node) { |
2687 | node = container_of(rb_node, struct interval_tree_node, rb); |
2688 | if (node->last >= addr) { |
2689 | WARN(1, "Overlap with prev node and page fault addr\n" ); |
2690 | return -EFAULT; |
2691 | } |
2692 | start_limit = max(start_limit, node->last + 1); |
2693 | } |
2694 | |
2695 | *start = start_limit; |
2696 | *last = end_limit - 1; |
2697 | |
2698 | pr_debug("vma [0x%lx 0x%lx] range [0x%lx 0x%lx] is_heap_stack %d\n" , |
2699 | vma->vm_start >> PAGE_SHIFT, vma->vm_end >> PAGE_SHIFT, |
2700 | *start, *last, *is_heap_stack); |
2701 | |
2702 | return 0; |
2703 | } |
2704 | |
2705 | static int |
2706 | svm_range_check_vm_userptr(struct kfd_process *p, uint64_t start, uint64_t last, |
2707 | uint64_t *bo_s, uint64_t *bo_l) |
2708 | { |
2709 | struct amdgpu_bo_va_mapping *mapping; |
2710 | struct interval_tree_node *node; |
2711 | struct amdgpu_bo *bo = NULL; |
2712 | unsigned long userptr; |
2713 | uint32_t i; |
2714 | int r; |
2715 | |
2716 | for (i = 0; i < p->n_pdds; i++) { |
2717 | struct amdgpu_vm *vm; |
2718 | |
2719 | if (!p->pdds[i]->drm_priv) |
2720 | continue; |
2721 | |
2722 | vm = drm_priv_to_vm(p->pdds[i]->drm_priv); |
2723 | r = amdgpu_bo_reserve(bo: vm->root.bo, no_intr: false); |
2724 | if (r) |
2725 | return r; |
2726 | |
2727 | /* Check userptr by searching entire vm->va interval tree */ |
2728 | node = interval_tree_iter_first(root: &vm->va, start: 0, last: ~0ULL); |
2729 | while (node) { |
2730 | mapping = container_of((struct rb_node *)node, |
2731 | struct amdgpu_bo_va_mapping, rb); |
2732 | bo = mapping->bo_va->base.bo; |
2733 | |
2734 | if (!amdgpu_ttm_tt_affect_userptr(ttm: bo->tbo.ttm, |
2735 | start: start << PAGE_SHIFT, |
2736 | end: last << PAGE_SHIFT, |
2737 | userptr: &userptr)) { |
2738 | node = interval_tree_iter_next(node, start: 0, last: ~0ULL); |
2739 | continue; |
2740 | } |
2741 | |
2742 | pr_debug("[0x%llx 0x%llx] already userptr mapped\n" , |
2743 | start, last); |
2744 | if (bo_s && bo_l) { |
2745 | *bo_s = userptr >> PAGE_SHIFT; |
2746 | *bo_l = *bo_s + bo->tbo.ttm->num_pages - 1; |
2747 | } |
2748 | amdgpu_bo_unreserve(bo: vm->root.bo); |
2749 | return -EADDRINUSE; |
2750 | } |
2751 | amdgpu_bo_unreserve(bo: vm->root.bo); |
2752 | } |
2753 | return 0; |
2754 | } |
2755 | |
2756 | static struct |
2757 | svm_range *svm_range_create_unregistered_range(struct kfd_node *node, |
2758 | struct kfd_process *p, |
2759 | struct mm_struct *mm, |
2760 | int64_t addr) |
2761 | { |
2762 | struct svm_range *prange = NULL; |
2763 | unsigned long start, last; |
2764 | uint32_t gpuid, gpuidx; |
2765 | bool is_heap_stack; |
2766 | uint64_t bo_s = 0; |
2767 | uint64_t bo_l = 0; |
2768 | int r; |
2769 | |
2770 | if (svm_range_get_range_boundaries(p, addr, start: &start, last: &last, |
2771 | is_heap_stack: &is_heap_stack)) |
2772 | return NULL; |
2773 | |
2774 | r = svm_range_check_vm(p, start, last, bo_s: &bo_s, bo_l: &bo_l); |
2775 | if (r != -EADDRINUSE) |
2776 | r = svm_range_check_vm_userptr(p, start, last, bo_s: &bo_s, bo_l: &bo_l); |
2777 | |
2778 | if (r == -EADDRINUSE) { |
2779 | if (addr >= bo_s && addr <= bo_l) |
2780 | return NULL; |
2781 | |
		/* Create a one-page svm range if the 2MB range overlaps an
		 * existing mapping
		 */
2783 | start = addr; |
2784 | last = addr; |
2785 | } |
2786 | |
2787 | prange = svm_range_new(svms: &p->svms, start, last, update_mem_usage: true); |
2788 | if (!prange) { |
2789 | pr_debug("Failed to create prange in address [0x%llx]\n" , addr); |
2790 | return NULL; |
2791 | } |
2792 | if (kfd_process_gpuid_from_node(p, node, gpuid: &gpuid, gpuidx: &gpuidx)) { |
2793 | pr_debug("failed to get gpuid from kgd\n" ); |
2794 | svm_range_free(prange, do_unmap: true); |
2795 | return NULL; |
2796 | } |
2797 | |
2798 | if (is_heap_stack) |
2799 | prange->preferred_loc = KFD_IOCTL_SVM_LOCATION_SYSMEM; |
2800 | |
2801 | svm_range_add_to_svms(prange); |
2802 | svm_range_add_notifier_locked(mm, prange); |
2803 | |
2804 | return prange; |
2805 | } |
2806 | |
2807 | /* svm_range_skip_recover - decide if prange can be recovered |
2808 | * @prange: svm range structure |
2809 | * |
 * The GPU vm retry fault handler skips recovering the range in these cases:
 * 1. prange is on the deferred list to be removed after unmap; the fault is
 *    stale and the deferred list work drains it before freeing the prange.
 * 2. prange is on the deferred list to add its interval notifier after a split.
 * 3. prange is a child range split from a parent prange; recover it later,
 *    after its interval notifier is added.
2816 | * |
2817 | * Return: true to skip recover, false to recover |
2818 | */ |
2819 | static bool svm_range_skip_recover(struct svm_range *prange) |
2820 | { |
2821 | struct svm_range_list *svms = prange->svms; |
2822 | |
2823 | spin_lock(lock: &svms->deferred_list_lock); |
2824 | if (list_empty(head: &prange->deferred_list) && |
2825 | list_empty(head: &prange->child_list)) { |
2826 | spin_unlock(lock: &svms->deferred_list_lock); |
2827 | return false; |
2828 | } |
2829 | spin_unlock(lock: &svms->deferred_list_lock); |
2830 | |
2831 | if (prange->work_item.op == SVM_OP_UNMAP_RANGE) { |
2832 | pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] unmapped\n" , |
2833 | svms, prange, prange->start, prange->last); |
2834 | return true; |
2835 | } |
2836 | if (prange->work_item.op == SVM_OP_ADD_RANGE_AND_MAP || |
2837 | prange->work_item.op == SVM_OP_ADD_RANGE) { |
2838 | pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] not added yet\n" , |
2839 | svms, prange, prange->start, prange->last); |
2840 | return true; |
2841 | } |
2842 | return false; |
2843 | } |
2844 | |
2845 | static void |
2846 | svm_range_count_fault(struct kfd_node *node, struct kfd_process *p, |
2847 | int32_t gpuidx) |
2848 | { |
2849 | struct kfd_process_device *pdd; |
2850 | |
	/* The fault is on a different page of the same range, or recovery was
	 * skipped for later, or the fault is on an invalid virtual address.
	 */
2855 | if (gpuidx == MAX_GPU_INSTANCE) { |
2856 | uint32_t gpuid; |
2857 | int r; |
2858 | |
2859 | r = kfd_process_gpuid_from_node(p, node, gpuid: &gpuid, gpuidx: &gpuidx); |
2860 | if (r < 0) |
2861 | return; |
2862 | } |
2863 | |
	/* The fault was recovered, or it cannot be recovered because the GPU
	 * has no access to the range.
	 */
2867 | pdd = kfd_process_device_from_gpuidx(p, gpuidx); |
2868 | if (pdd) |
2869 | WRITE_ONCE(pdd->faults, pdd->faults + 1); |
2870 | } |
2871 | |
2872 | static bool |
2873 | svm_fault_allowed(struct vm_area_struct *vma, bool write_fault) |
2874 | { |
2875 | unsigned long requested = VM_READ; |
2876 | |
2877 | if (write_fault) |
2878 | requested |= VM_WRITE; |
2879 | |
2880 | pr_debug("requested 0x%lx, vma permission flags 0x%lx\n" , requested, |
2881 | vma->vm_flags); |
2882 | return (vma->vm_flags & requested) == requested; |
2883 | } |
2884 | |
2885 | int |
2886 | svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid, |
2887 | uint32_t vmid, uint32_t node_id, |
2888 | uint64_t addr, bool write_fault) |
2889 | { |
2890 | unsigned long start, last, size; |
2891 | struct mm_struct *mm = NULL; |
2892 | struct svm_range_list *svms; |
2893 | struct svm_range *prange; |
2894 | struct kfd_process *p; |
2895 | ktime_t timestamp = ktime_get_boottime(); |
2896 | struct kfd_node *node; |
2897 | int32_t best_loc; |
2898 | int32_t gpuidx = MAX_GPU_INSTANCE; |
2899 | bool write_locked = false; |
2900 | struct vm_area_struct *vma; |
2901 | bool migration = false; |
2902 | int r = 0; |
2903 | |
2904 | if (!KFD_IS_SVM_API_SUPPORTED(adev)) { |
2905 | pr_debug("device does not support SVM\n" ); |
2906 | return -EFAULT; |
2907 | } |
2908 | |
2909 | p = kfd_lookup_process_by_pasid(pasid); |
2910 | if (!p) { |
		pr_debug("kfd process not found, pasid 0x%x\n", pasid);
2912 | return 0; |
2913 | } |
2914 | svms = &p->svms; |
2915 | |
2916 | pr_debug("restoring svms 0x%p fault address 0x%llx\n" , svms, addr); |
2917 | |
2918 | if (atomic_read(v: &svms->drain_pagefaults)) { |
2919 | pr_debug("draining retry fault, drop fault 0x%llx\n" , addr); |
2920 | r = 0; |
2921 | goto out; |
2922 | } |
2923 | |
2924 | if (!p->xnack_enabled) { |
2925 | pr_debug("XNACK not enabled for pasid 0x%x\n" , pasid); |
2926 | r = -EFAULT; |
2927 | goto out; |
2928 | } |
2929 | |
	/* p->lead_thread is available because kfd_process_wq_release flushes
	 * the work before releasing the task ref.
	 */
2933 | mm = get_task_mm(task: p->lead_thread); |
2934 | if (!mm) { |
2935 | pr_debug("svms 0x%p failed to get mm\n" , svms); |
2936 | r = 0; |
2937 | goto out; |
2938 | } |
2939 | |
2940 | node = kfd_node_by_irq_ids(adev, node_id, vmid); |
2941 | if (!node) { |
2942 | pr_debug("kfd node does not exist node_id: %d, vmid: %d\n" , node_id, |
2943 | vmid); |
2944 | r = -EFAULT; |
2945 | goto out; |
2946 | } |
2947 | mmap_read_lock(mm); |
2948 | retry_write_locked: |
2949 | mutex_lock(&svms->lock); |
2950 | prange = svm_range_from_addr(svms, addr, NULL); |
2951 | if (!prange) { |
2952 | pr_debug("failed to find prange svms 0x%p address [0x%llx]\n" , |
2953 | svms, addr); |
2954 | if (!write_locked) { |
2955 | /* Need the write lock to create new range with MMU notifier. |
2956 | * Also flush pending deferred work to make sure the interval |
2957 | * tree is up to date before we add a new range |
2958 | */ |
2959 | mutex_unlock(lock: &svms->lock); |
2960 | mmap_read_unlock(mm); |
2961 | mmap_write_lock(mm); |
2962 | write_locked = true; |
2963 | goto retry_write_locked; |
2964 | } |
2965 | prange = svm_range_create_unregistered_range(node, p, mm, addr); |
2966 | if (!prange) { |
2967 | pr_debug("failed to create unregistered range svms 0x%p address [0x%llx]\n" , |
2968 | svms, addr); |
2969 | mmap_write_downgrade(mm); |
2970 | r = -EFAULT; |
2971 | goto out_unlock_svms; |
2972 | } |
2973 | } |
2974 | if (write_locked) |
2975 | mmap_write_downgrade(mm); |
2976 | |
2977 | mutex_lock(&prange->migrate_mutex); |
2978 | |
2979 | if (svm_range_skip_recover(prange)) { |
2980 | amdgpu_gmc_filter_faults_remove(adev: node->adev, addr, pasid); |
2981 | r = 0; |
2982 | goto out_unlock_range; |
2983 | } |
2984 | |
2985 | /* skip duplicate vm fault on different pages of same range */ |
2986 | if (ktime_before(cmp1: timestamp, ktime_add_ns(prange->validate_timestamp, |
2987 | AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING))) { |
2988 | pr_debug("svms 0x%p [0x%lx %lx] already restored\n" , |
2989 | svms, prange->start, prange->last); |
2990 | r = 0; |
2991 | goto out_unlock_range; |
2992 | } |
2993 | |
2994 | /* __do_munmap removed VMA, return success as we are handling stale |
2995 | * retry fault. |
2996 | */ |
2997 | vma = vma_lookup(mm, addr: addr << PAGE_SHIFT); |
2998 | if (!vma) { |
2999 | pr_debug("address 0x%llx VMA is removed\n" , addr); |
3000 | r = 0; |
3001 | goto out_unlock_range; |
3002 | } |
3003 | |
3004 | if (!svm_fault_allowed(vma, write_fault)) { |
3005 | pr_debug("fault addr 0x%llx no %s permission\n" , addr, |
3006 | write_fault ? "write" : "read" ); |
3007 | r = -EPERM; |
3008 | goto out_unlock_range; |
3009 | } |
3010 | |
3011 | best_loc = svm_range_best_restore_location(prange, node, gpuidx: &gpuidx); |
3012 | if (best_loc == -1) { |
3013 | pr_debug("svms %p failed get best restore loc [0x%lx 0x%lx]\n" , |
3014 | svms, prange->start, prange->last); |
3015 | r = -EACCES; |
3016 | goto out_unlock_range; |
3017 | } |
3018 | |
3019 | pr_debug("svms %p [0x%lx 0x%lx] best restore 0x%x, actual loc 0x%x\n" , |
3020 | svms, prange->start, prange->last, best_loc, |
3021 | prange->actual_loc); |
3022 | |
3023 | kfd_smi_event_page_fault_start(node, pid: p->lead_thread->pid, address: addr, |
3024 | write_fault, ts: timestamp); |
3025 | |
3026 | /* Align migration range start and size to granularity size */ |
3027 | size = 1UL << prange->granularity; |
3028 | start = max_t(unsigned long, ALIGN_DOWN(addr, size), prange->start); |
3029 | last = min_t(unsigned long, ALIGN(addr + 1, size) - 1, prange->last); |
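	/* Migration is only needed if either the current or the desired
	 * location is VRAM; if both are system memory (0), just validate and
	 * map the existing pages.
	 */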
3030 | if (prange->actual_loc != 0 || best_loc != 0) { |
3031 | migration = true; |
3032 | |
3033 | if (best_loc) { |
3034 | r = svm_migrate_to_vram(prange, best_loc, start, last, |
3035 | mm, trigger: KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU); |
3036 | if (r) { |
3037 | pr_debug("svm_migrate_to_vram failed (%d) at %llx, falling back to system memory\n" , |
3038 | r, addr); |
3039 | /* Fallback to system memory if migration to |
3040 | * VRAM failed |
3041 | */ |
3042 | if (prange->actual_loc && prange->actual_loc != best_loc) |
3043 | r = svm_migrate_vram_to_ram(prange, mm, start, last, |
3044 | trigger: KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU, NULL); |
3045 | else |
3046 | r = 0; |
3047 | } |
3048 | } else { |
3049 | r = svm_migrate_vram_to_ram(prange, mm, start, last, |
3050 | trigger: KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU, NULL); |
3051 | } |
3052 | if (r) { |
3053 | pr_debug("failed %d to migrate svms %p [0x%lx 0x%lx]\n" , |
3054 | r, svms, start, last); |
3055 | goto out_unlock_range; |
3056 | } |
3057 | } |
3058 | |
3059 | r = svm_range_validate_and_map(mm, map_start: start, map_last: last, prange, gpuidx, intr: false, |
3060 | wait: false, flush_tlb: false); |
3061 | if (r) |
3062 | pr_debug("failed %d to map svms 0x%p [0x%lx 0x%lx] to gpus\n" , |
3063 | r, svms, start, last); |
3064 | |
3065 | kfd_smi_event_page_fault_end(node, pid: p->lead_thread->pid, address: addr, |
3066 | migration); |
3067 | |
3068 | out_unlock_range: |
3069 | mutex_unlock(lock: &prange->migrate_mutex); |
3070 | out_unlock_svms: |
3071 | mutex_unlock(lock: &svms->lock); |
3072 | mmap_read_unlock(mm); |
3073 | |
3074 | svm_range_count_fault(node, p, gpuidx); |
3075 | |
3076 | mmput(mm); |
3077 | out: |
3078 | kfd_unref_process(p); |
3079 | |
3080 | if (r == -EAGAIN) { |
3081 | pr_debug("recover vm fault later\n" ); |
3082 | amdgpu_gmc_filter_faults_remove(adev: node->adev, addr, pasid); |
3083 | r = 0; |
3084 | } |
3085 | return r; |
3086 | } |
3087 | |
3088 | int |
3089 | svm_range_switch_xnack_reserve_mem(struct kfd_process *p, bool xnack_enabled) |
3090 | { |
3091 | struct svm_range *prange, *pchild; |
3092 | uint64_t reserved_size = 0; |
3093 | uint64_t size; |
3094 | int r = 0; |
3095 | |
3096 | pr_debug("switching xnack from %d to %d\n" , p->xnack_enabled, xnack_enabled); |
3097 | |
3098 | mutex_lock(&p->svms.lock); |
3099 | |
3100 | list_for_each_entry(prange, &p->svms.list, list) { |
3101 | svm_range_lock(prange); |
3102 | list_for_each_entry(pchild, &prange->child_list, child_list) { |
3103 | size = (pchild->last - pchild->start + 1) << PAGE_SHIFT; |
3104 | if (xnack_enabled) { |
3105 | amdgpu_amdkfd_unreserve_mem_limit(NULL, size, |
3106 | KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, xcp_id: 0); |
3107 | } else { |
3108 | r = amdgpu_amdkfd_reserve_mem_limit(NULL, size, |
3109 | KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, xcp_id: 0); |
3110 | if (r) |
3111 | goto out_unlock; |
3112 | reserved_size += size; |
3113 | } |
3114 | } |
3115 | |
3116 | size = (prange->last - prange->start + 1) << PAGE_SHIFT; |
3117 | if (xnack_enabled) { |
3118 | amdgpu_amdkfd_unreserve_mem_limit(NULL, size, |
3119 | KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, xcp_id: 0); |
3120 | } else { |
3121 | r = amdgpu_amdkfd_reserve_mem_limit(NULL, size, |
3122 | KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, xcp_id: 0); |
3123 | if (r) |
3124 | goto out_unlock; |
3125 | reserved_size += size; |
3126 | } |
3127 | out_unlock: |
3128 | svm_range_unlock(prange); |
3129 | if (r) |
3130 | break; |
3131 | } |
3132 | |
3133 | if (r) |
3134 | amdgpu_amdkfd_unreserve_mem_limit(NULL, size: reserved_size, |
3135 | KFD_IOC_ALLOC_MEM_FLAGS_USERPTR, xcp_id: 0); |
3136 | else |
		/* Changing the xnack mode must be done under the svms lock to
		 * avoid racing with svm_range_deferred_list_work unreserving
		 * memory in parallel.
		 */
3140 | p->xnack_enabled = xnack_enabled; |
3141 | |
3142 | mutex_unlock(lock: &p->svms.lock); |
3143 | return r; |
3144 | } |
3145 | |
3146 | void svm_range_list_fini(struct kfd_process *p) |
3147 | { |
3148 | struct svm_range *prange; |
3149 | struct svm_range *next; |
3150 | |
	pr_debug("pasid 0x%x svms 0x%p\n", p->pasid, &p->svms);

	cancel_delayed_work_sync(&p->svms.restore_work);

	/* Ensure list work is finished before process is destroyed */
	flush_work(&p->svms.deferred_list_work);
3157 | |
	/*
	 * Ensure no retry fault comes in afterwards, as the page fault handler
	 * will not be able to find the kfd process and take the mm lock to
	 * recover the fault.
	 */
	atomic_inc(&p->svms.drain_pagefaults);
	svm_range_drain_retry_fault(&p->svms);
3164 | |
3165 | list_for_each_entry_safe(prange, next, &p->svms.list, list) { |
3166 | svm_range_unlink(prange); |
3167 | svm_range_remove_notifier(prange); |
		svm_range_free(prange, true);
3169 | } |
3170 | |
	mutex_destroy(&p->svms.lock);

	pr_debug("pasid 0x%x svms 0x%p done\n", p->pasid, &p->svms);
3174 | } |
3175 | |
3176 | int svm_range_list_init(struct kfd_process *p) |
3177 | { |
3178 | struct svm_range_list *svms = &p->svms; |
3179 | int i; |
3180 | |
3181 | svms->objects = RB_ROOT_CACHED; |
3182 | mutex_init(&svms->lock); |
	INIT_LIST_HEAD(&svms->list);
	atomic_set(&svms->evicted_ranges, 0);
	atomic_set(&svms->drain_pagefaults, 0);
	INIT_DELAYED_WORK(&svms->restore_work, svm_range_restore_work);
	INIT_WORK(&svms->deferred_list_work, svm_range_deferred_list_work);
	INIT_LIST_HEAD(&svms->deferred_range_list);
	INIT_LIST_HEAD(&svms->criu_svm_metadata_list);
	spin_lock_init(&svms->deferred_list_lock);

	for (i = 0; i < p->n_pdds; i++)
		if (KFD_IS_SVM_API_SUPPORTED(p->pdds[i]->dev->adev))
			bitmap_set(svms->bitmap_supported, i, 1);
3195 | |
3196 | return 0; |
3197 | } |
3198 | |
3199 | /** |
3200 | * svm_range_check_vm - check if virtual address range mapped already |
3201 | * @p: current kfd_process |
3202 | * @start: range start address, in pages |
3203 | * @last: range last address, in pages |
3204 | * @bo_s: mapping start address in pages if address range already mapped |
3205 | * @bo_l: mapping last address in pages if address range already mapped |
3206 | * |
3207 | * The purpose is to avoid virtual address ranges already allocated by |
3208 | * kfd_ioctl_alloc_memory_of_gpu ioctl. |
 * It looks at each pdd in the kfd_process.
3210 | * |
3211 | * Context: Process context |
3212 | * |
3213 | * Return 0 - OK, if the range is not mapped. |
3214 | * Otherwise error code: |
3215 | * -EADDRINUSE - if address is mapped already by kfd_ioctl_alloc_memory_of_gpu |
3216 | * -ERESTARTSYS - A wait for the buffer to become unreserved was interrupted by |
3217 | * a signal. Release all buffer reservations and return to user-space. |
3218 | */ |
3219 | static int |
3220 | svm_range_check_vm(struct kfd_process *p, uint64_t start, uint64_t last, |
3221 | uint64_t *bo_s, uint64_t *bo_l) |
3222 | { |
3223 | struct amdgpu_bo_va_mapping *mapping; |
3224 | struct interval_tree_node *node; |
3225 | uint32_t i; |
3226 | int r; |
3227 | |
3228 | for (i = 0; i < p->n_pdds; i++) { |
3229 | struct amdgpu_vm *vm; |
3230 | |
3231 | if (!p->pdds[i]->drm_priv) |
3232 | continue; |
3233 | |
3234 | vm = drm_priv_to_vm(p->pdds[i]->drm_priv); |
		r = amdgpu_bo_reserve(vm->root.bo, false);
3236 | if (r) |
3237 | return r; |
3238 | |
		node = interval_tree_iter_first(&vm->va, start, last);
		if (node) {
			pr_debug("range [0x%llx 0x%llx] already TTM mapped\n",
3242 | start, last); |
3243 | mapping = container_of((struct rb_node *)node, |
3244 | struct amdgpu_bo_va_mapping, rb); |
3245 | if (bo_s && bo_l) { |
3246 | *bo_s = mapping->start; |
3247 | *bo_l = mapping->last; |
3248 | } |
			amdgpu_bo_unreserve(vm->root.bo);
3250 | return -EADDRINUSE; |
3251 | } |
		amdgpu_bo_unreserve(vm->root.bo);
3253 | } |
3254 | |
3255 | return 0; |
3256 | } |
3257 | |
3258 | /** |
3259 | * svm_range_is_valid - check if virtual address range is valid |
3260 | * @p: current kfd_process |
3261 | * @start: range start address, in pages |
3262 | * @size: range size, in pages |
3263 | * |
3264 | * Valid virtual address range means it belongs to one or more VMAs |
3265 | * |
3266 | * Context: Process context |
3267 | * |
3268 | * Return: |
3269 | * 0 - OK, otherwise error code |
3270 | */ |
3271 | static int |
3272 | svm_range_is_valid(struct kfd_process *p, uint64_t start, uint64_t size) |
3273 | { |
3274 | const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP; |
3275 | struct vm_area_struct *vma; |
3276 | unsigned long end; |
3277 | unsigned long start_unchg = start; |
3278 | |
3279 | start <<= PAGE_SHIFT; |
3280 | end = start + (size << PAGE_SHIFT); |
3281 | do { |
		vma = vma_lookup(p->mm, start);
3283 | if (!vma || (vma->vm_flags & device_vma)) |
3284 | return -EFAULT; |
3285 | start = min(end, vma->vm_end); |
3286 | } while (start < end); |
3287 | |
	return svm_range_check_vm(p, start_unchg, (end - 1) >> PAGE_SHIFT, NULL,
				  NULL);
3290 | } |
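
/*
 * Illustrative example (assumed address layout): if the process has VMAs
 * covering [0x7000000000, 0x7000200000) and [0x7000300000, 0x7000400000),
 * a request for the pages backing [0x7000100000, 0x7000380000) fails with
 * -EFAULT in svm_range_is_valid because the hole [0x7000200000, 0x7000300000)
 * is not covered by any VMA. A request entirely inside one VMA still has to
 * pass the svm_range_check_vm overlap check against existing TTM mappings.
 */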
3291 | |
3292 | /** |
3293 | * svm_range_best_prefetch_location - decide the best prefetch location |
3294 | * @prange: svm range structure |
3295 | * |
3296 | * For xnack off: |
 * If the range maps to a single GPU, the best prefetch location is prefetch_loc,
 * which can be CPU or GPU.
 *
 * If the range is ACCESS or ACCESS_IN_PLACE by mGPUs, the best prefetch location
 * is the prefetch_loc GPU only if the mGPUs are connected in the same XGMI hive;
 * otherwise the best prefetch location is always CPU, because a GPU cannot have
 * a coherent mapping of another GPU's VRAM even with a large-BAR PCIe connection.
 *
 * For xnack on:
 * If the range is not ACCESS_IN_PLACE by mGPUs, the best prefetch location is
 * prefetch_loc; access from other GPUs will generate a vm fault and trigger
 * migration.
 *
 * If the range is ACCESS_IN_PLACE by mGPUs, the best prefetch location is the
 * prefetch_loc GPU only if the mGPUs are connected in the same XGMI hive;
 * otherwise the best prefetch location is always CPU.
3312 | * |
3313 | * Context: Process context |
3314 | * |
3315 | * Return: |
3316 | * 0 for CPU or GPU id |
3317 | */ |
3318 | static uint32_t |
3319 | svm_range_best_prefetch_location(struct svm_range *prange) |
3320 | { |
3321 | DECLARE_BITMAP(bitmap, MAX_GPU_INSTANCE); |
3322 | uint32_t best_loc = prange->prefetch_loc; |
3323 | struct kfd_process_device *pdd; |
3324 | struct kfd_node *bo_node; |
3325 | struct kfd_process *p; |
3326 | uint32_t gpuidx; |
3327 | |
3328 | p = container_of(prange->svms, struct kfd_process, svms); |
3329 | |
3330 | if (!best_loc || best_loc == KFD_IOCTL_SVM_LOCATION_UNDEFINED) |
3331 | goto out; |
3332 | |
	bo_node = svm_range_get_node_by_id(prange, best_loc);
	if (!bo_node) {
		WARN_ONCE(1, "failed to get valid kfd node at id%x\n", best_loc);
3336 | best_loc = 0; |
3337 | goto out; |
3338 | } |
3339 | |
3340 | if (bo_node->adev->gmc.is_app_apu) { |
3341 | best_loc = 0; |
3342 | goto out; |
3343 | } |
3344 | |
3345 | if (p->xnack_enabled) |
		bitmap_copy(bitmap, prange->bitmap_aip, MAX_GPU_INSTANCE);
	else
		bitmap_or(bitmap, prange->bitmap_access, prange->bitmap_aip,
			  MAX_GPU_INSTANCE);
3350 | |
3351 | for_each_set_bit(gpuidx, bitmap, MAX_GPU_INSTANCE) { |
3352 | pdd = kfd_process_device_from_gpuidx(p, gpuidx); |
3353 | if (!pdd) { |
			pr_debug("failed to get device by idx 0x%x\n", gpuidx);
3355 | continue; |
3356 | } |
3357 | |
3358 | if (pdd->dev->adev == bo_node->adev) |
3359 | continue; |
3360 | |
		if (!svm_nodes_in_same_hive(pdd->dev, bo_node)) {
3362 | best_loc = 0; |
3363 | break; |
3364 | } |
3365 | } |
3366 | |
3367 | out: |
	pr_debug("xnack %d svms 0x%p [0x%lx 0x%lx] best loc 0x%x\n",
3369 | p->xnack_enabled, &p->svms, prange->start, prange->last, |
3370 | best_loc); |
3371 | |
3372 | return best_loc; |
3373 | } |
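
/*
 * Illustrative sketch (not part of the driver logic), with GPU_A and GPU_B as
 * hypothetical placeholders: with xnack off, a range accessible by GPU_A (the
 * prefetch target) and GPU_B reduces to roughly
 *
 *	best_loc = prange->prefetch_loc;
 *	if (!svm_nodes_in_same_hive(GPU_B, GPU_A))
 *		best_loc = 0;
 *
 * i.e. a single accessible GPU outside the XGMI hive of the prefetch target
 * forces the best prefetch location back to system memory.
 */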
3374 | |
3375 | /* svm_range_trigger_migration - start page migration if prefetch loc changed |
3376 | * @mm: current process mm_struct |
3377 | * @prange: svm range structure |
3378 | * @migrated: output, true if migration is triggered |
3379 | * |
 * If the range prefetch_loc is a GPU and the actual loc is cpu 0, migrate the
 * range from ram to vram.
 * If the range prefetch_loc is cpu 0 and the actual loc is a GPU, migrate the
 * range from vram to ram.
 *
 * If GPU vm fault retry is not enabled, migration interacts with the MMU
 * notifier and the restore work:
 * 1. migrate_vma_setup invalidates pages, the MMU notifier callback
 *    svm_range_evict stops all queues and schedules the restore work
 * 2. svm_range_restore_work waits for the migration to finish via
 *    a. svm_range_validate_vram taking prange->migrate_mutex
 *    b. svm_range_validate_ram HMM get pages waiting until the CPU fault
 *       handler returns
 * 3. the restore work updates the GPU mappings and resumes all queues.
3393 | * |
3394 | * Context: Process context |
3395 | * |
3396 | * Return: |
3397 | * 0 - OK, otherwise - error code of migration |
3398 | */ |
3399 | static int |
3400 | svm_range_trigger_migration(struct mm_struct *mm, struct svm_range *prange, |
3401 | bool *migrated) |
3402 | { |
3403 | uint32_t best_loc; |
3404 | int r = 0; |
3405 | |
3406 | *migrated = false; |
3407 | best_loc = svm_range_best_prefetch_location(prange); |
3408 | |
	/* When best_loc is a gpu node and the same as prange->actual_loc, we
	 * still need to do the migration, because prange->actual_loc != 0 does
	 * not mean all pages in prange are in vram. HMM migration will pick up
	 * the right pages during migration.
	 */
3414 | if ((best_loc == KFD_IOCTL_SVM_LOCATION_UNDEFINED) || |
3415 | (best_loc == 0 && prange->actual_loc == 0)) |
3416 | return 0; |
3417 | |
3418 | if (!best_loc) { |
		r = svm_migrate_vram_to_ram(prange, mm, prange->start, prange->last,
					    KFD_MIGRATE_TRIGGER_PREFETCH, NULL);
3421 | *migrated = !r; |
3422 | return r; |
3423 | } |
3424 | |
	r = svm_migrate_to_vram(prange, best_loc, prange->start, prange->last,
				mm, KFD_MIGRATE_TRIGGER_PREFETCH);
3427 | *migrated = !r; |
3428 | |
3429 | return r; |
3430 | } |
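
/*
 * Simplified sketch of the caller pattern in svm_range_set_attr below
 * (illustrative only; xnack and flag details omitted):
 *
 *	r = svm_range_trigger_migration(mm, prange, &migrated);
 *	if (!r && migrated && !p->xnack_enabled && prange->mapped_to_gpu)
 *		skip mapping here, the restore work re-creates GPU mappings;
 *	else if (!r)
 *		r = svm_range_validate_and_map(mm, prange->start, prange->last,
 *					       prange, MAX_GPU_INSTANCE,
 *					       true, true, flush_tlb);
 */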
3431 | |
3432 | int svm_range_schedule_evict_svm_bo(struct amdgpu_amdkfd_fence *fence) |
3433 | { |
3434 | /* Dereferencing fence->svm_bo is safe here because the fence hasn't |
3435 | * signaled yet and we're under the protection of the fence->lock. |
3436 | * After the fence is signaled in svm_range_bo_release, we cannot get |
3437 | * here any more. |
3438 | * |
3439 | * Reference is dropped in svm_range_evict_svm_bo_worker. |
3440 | */ |
	if (svm_bo_ref_unless_zero(fence->svm_bo)) {
		WRITE_ONCE(fence->svm_bo->evicting, 1);
		schedule_work(&fence->svm_bo->eviction_work);
3444 | } |
3445 | |
3446 | return 0; |
3447 | } |
3448 | |
3449 | static void svm_range_evict_svm_bo_worker(struct work_struct *work) |
3450 | { |
3451 | struct svm_range_bo *svm_bo; |
3452 | struct mm_struct *mm; |
3453 | int r = 0; |
3454 | |
3455 | svm_bo = container_of(work, struct svm_range_bo, eviction_work); |
3456 | |
	if (mmget_not_zero(svm_bo->eviction_fence->mm)) {
3458 | mm = svm_bo->eviction_fence->mm; |
3459 | } else { |
3460 | svm_range_bo_unref(svm_bo); |
3461 | return; |
3462 | } |
3463 | |
3464 | mmap_read_lock(mm); |
	spin_lock(&svm_bo->list_lock);
	while (!list_empty(&svm_bo->range_list) && !r) {
3467 | struct svm_range *prange = |
3468 | list_first_entry(&svm_bo->range_list, |
3469 | struct svm_range, svm_bo_list); |
3470 | int retries = 3; |
3471 | |
		list_del_init(&prange->svm_bo_list);
		spin_unlock(&svm_bo->list_lock);

		pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms,
3476 | prange->start, prange->last); |
3477 | |
3478 | mutex_lock(&prange->migrate_mutex); |
3479 | do { |
			/* migrate all vram pages in this prange to sys ram;
			 * after that, prange->actual_loc should be zero
			 */
			r = svm_migrate_vram_to_ram(prange, mm,
					prange->start, prange->last,
					KFD_MIGRATE_TRIGGER_TTM_EVICTION, NULL);
3486 | } while (!r && prange->actual_loc && --retries); |
3487 | |
3488 | if (!r && prange->actual_loc) |
			pr_info_once("Migration failed during eviction");
3490 | |
3491 | if (!prange->actual_loc) { |
3492 | mutex_lock(&prange->lock); |
3493 | prange->svm_bo = NULL; |
			mutex_unlock(&prange->lock);
		}
		mutex_unlock(&prange->migrate_mutex);

		spin_lock(&svm_bo->list_lock);
	}
	spin_unlock(&svm_bo->list_lock);
3501 | mmap_read_unlock(mm); |
3502 | mmput(mm); |
3503 | |
	dma_fence_signal(&svm_bo->eviction_fence->base);
3505 | |
3506 | /* This is the last reference to svm_bo, after svm_range_vram_node_free |
3507 | * has been called in svm_migrate_vram_to_ram |
3508 | */ |
	WARN_ONCE(!r && kref_read(&svm_bo->kref) != 1, "This was not the last reference\n");
3510 | svm_range_bo_unref(svm_bo); |
3511 | } |
3512 | |
3513 | static int |
3514 | svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm, |
3515 | uint64_t start, uint64_t size, uint32_t nattr, |
3516 | struct kfd_ioctl_svm_attribute *attrs) |
3517 | { |
3518 | struct amdkfd_process_info *process_info = p->kgd_process_info; |
3519 | struct list_head update_list; |
3520 | struct list_head insert_list; |
3521 | struct list_head remove_list; |
3522 | struct list_head remap_list; |
3523 | struct svm_range_list *svms; |
3524 | struct svm_range *prange; |
3525 | struct svm_range *next; |
3526 | bool update_mapping = false; |
3527 | bool flush_tlb; |
3528 | int r, ret = 0; |
3529 | |
	pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] pages 0x%llx\n",
3531 | p->pasid, &p->svms, start, start + size - 1, size); |
3532 | |
3533 | r = svm_range_check_attr(p, nattr, attrs); |
3534 | if (r) |
3535 | return r; |
3536 | |
3537 | svms = &p->svms; |
3538 | |
3539 | mutex_lock(&process_info->lock); |
3540 | |
3541 | svm_range_list_lock_and_flush_work(svms, mm); |
3542 | |
3543 | r = svm_range_is_valid(p, start, size); |
3544 | if (r) { |
		pr_debug("invalid range r=%d\n", r);
3546 | mmap_write_unlock(mm); |
3547 | goto out; |
3548 | } |
3549 | |
3550 | mutex_lock(&svms->lock); |
3551 | |
3552 | /* Add new range and split existing ranges as needed */ |
	r = svm_range_add(p, start, size, nattr, attrs, &update_list,
			  &insert_list, &remove_list, &remap_list);
	if (r) {
		mutex_unlock(&svms->lock);
3557 | mmap_write_unlock(mm); |
3558 | goto out; |
3559 | } |
3560 | /* Apply changes as a transaction */ |
3561 | list_for_each_entry_safe(prange, next, &insert_list, list) { |
3562 | svm_range_add_to_svms(prange); |
3563 | svm_range_add_notifier_locked(mm, prange); |
3564 | } |
3565 | list_for_each_entry(prange, &update_list, update_list) { |
		svm_range_apply_attrs(p, prange, nattr, attrs, &update_mapping);
3567 | /* TODO: unmap ranges from GPU that lost access */ |
3568 | } |
3569 | list_for_each_entry_safe(prange, next, &remove_list, update_list) { |
		pr_debug("unlink old 0x%p prange 0x%p [0x%lx 0x%lx]\n",
3571 | prange->svms, prange, prange->start, |
3572 | prange->last); |
3573 | svm_range_unlink(prange); |
3574 | svm_range_remove_notifier(prange); |
		svm_range_free(prange, false);
3576 | } |
3577 | |
3578 | mmap_write_downgrade(mm); |
3579 | /* Trigger migrations and revalidate and map to GPUs as needed. If |
3580 | * this fails we may be left with partially completed actions. There |
3581 | * is no clean way of rolling back to the previous state in such a |
3582 | * case because the rollback wouldn't be guaranteed to work either. |
3583 | */ |
3584 | list_for_each_entry(prange, &update_list, update_list) { |
3585 | bool migrated; |
3586 | |
3587 | mutex_lock(&prange->migrate_mutex); |
3588 | |
		r = svm_range_trigger_migration(mm, prange, &migrated);
3590 | if (r) |
3591 | goto out_unlock_range; |
3592 | |
3593 | if (migrated && (!p->xnack_enabled || |
3594 | (prange->flags & KFD_IOCTL_SVM_FLAG_GPU_ALWAYS_MAPPED)) && |
3595 | prange->mapped_to_gpu) { |
			pr_debug("restore_work will update mappings of GPUs\n");
			mutex_unlock(&prange->migrate_mutex);
3598 | continue; |
3599 | } |
3600 | |
3601 | if (!migrated && !update_mapping) { |
			mutex_unlock(&prange->migrate_mutex);
3603 | continue; |
3604 | } |
3605 | |
3606 | flush_tlb = !migrated && update_mapping && prange->mapped_to_gpu; |
3607 | |
		r = svm_range_validate_and_map(mm, prange->start, prange->last, prange,
					       MAX_GPU_INSTANCE, true, true, flush_tlb);
		if (r)
			pr_debug("failed %d to map svm range\n", r);
3612 | |
3613 | out_unlock_range: |
		mutex_unlock(&prange->migrate_mutex);
3615 | if (r) |
3616 | ret = r; |
3617 | } |
3618 | |
3619 | list_for_each_entry(prange, &remap_list, update_list) { |
		pr_debug("Remapping prange 0x%p [0x%lx 0x%lx]\n",
3621 | prange, prange->start, prange->last); |
3622 | mutex_lock(&prange->migrate_mutex); |
		r = svm_range_validate_and_map(mm, prange->start, prange->last, prange,
					       MAX_GPU_INSTANCE, true, true, prange->mapped_to_gpu);
		if (r)
			pr_debug("failed %d on remap svm range\n", r);
		mutex_unlock(&prange->migrate_mutex);
3628 | if (r) |
3629 | ret = r; |
3630 | } |
3631 | |
3632 | dynamic_svm_range_dump(svms); |
3633 | |
	mutex_unlock(&svms->lock);
3635 | mmap_read_unlock(mm); |
3636 | out: |
	mutex_unlock(&process_info->lock);

	pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] done, r=%d\n", p->pasid,
3640 | &p->svms, start, start + size - 1, r); |
3641 | |
3642 | return ret ? ret : r; |
3643 | } |
3644 | |
3645 | static int |
3646 | svm_range_get_attr(struct kfd_process *p, struct mm_struct *mm, |
3647 | uint64_t start, uint64_t size, uint32_t nattr, |
3648 | struct kfd_ioctl_svm_attribute *attrs) |
3649 | { |
3650 | DECLARE_BITMAP(bitmap_access, MAX_GPU_INSTANCE); |
3651 | DECLARE_BITMAP(bitmap_aip, MAX_GPU_INSTANCE); |
3652 | bool get_preferred_loc = false; |
3653 | bool get_prefetch_loc = false; |
3654 | bool get_granularity = false; |
3655 | bool get_accessible = false; |
3656 | bool get_flags = false; |
3657 | uint64_t last = start + size - 1UL; |
3658 | uint8_t granularity = 0xff; |
3659 | struct interval_tree_node *node; |
3660 | struct svm_range_list *svms; |
3661 | struct svm_range *prange; |
3662 | uint32_t prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED; |
3663 | uint32_t location = KFD_IOCTL_SVM_LOCATION_UNDEFINED; |
3664 | uint32_t flags_and = 0xffffffff; |
3665 | uint32_t flags_or = 0; |
3666 | int gpuidx; |
3667 | uint32_t i; |
3668 | int r = 0; |
3669 | |
	pr_debug("svms 0x%p [0x%llx 0x%llx] nattr 0x%x\n", &p->svms, start,
3671 | start + size - 1, nattr); |
3672 | |
3673 | /* Flush pending deferred work to avoid racing with deferred actions from |
3674 | * previous memory map changes (e.g. munmap). Concurrent memory map changes |
3675 | * can still race with get_attr because we don't hold the mmap lock. But that |
3676 | * would be a race condition in the application anyway, and undefined |
3677 | * behaviour is acceptable in that case. |
3678 | */ |
	flush_work(&p->svms.deferred_list_work);
3680 | |
3681 | mmap_read_lock(mm); |
3682 | r = svm_range_is_valid(p, start, size); |
3683 | mmap_read_unlock(mm); |
3684 | if (r) { |
		pr_debug("invalid range r=%d\n", r);
3686 | return r; |
3687 | } |
3688 | |
3689 | for (i = 0; i < nattr; i++) { |
3690 | switch (attrs[i].type) { |
3691 | case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC: |
3692 | get_preferred_loc = true; |
3693 | break; |
3694 | case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC: |
3695 | get_prefetch_loc = true; |
3696 | break; |
3697 | case KFD_IOCTL_SVM_ATTR_ACCESS: |
3698 | get_accessible = true; |
3699 | break; |
3700 | case KFD_IOCTL_SVM_ATTR_SET_FLAGS: |
3701 | case KFD_IOCTL_SVM_ATTR_CLR_FLAGS: |
3702 | get_flags = true; |
3703 | break; |
3704 | case KFD_IOCTL_SVM_ATTR_GRANULARITY: |
3705 | get_granularity = true; |
3706 | break; |
3707 | case KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE: |
3708 | case KFD_IOCTL_SVM_ATTR_NO_ACCESS: |
3709 | fallthrough; |
3710 | default: |
			pr_debug("get invalid attr type 0x%x\n", attrs[i].type);
3712 | return -EINVAL; |
3713 | } |
3714 | } |
3715 | |
3716 | svms = &p->svms; |
3717 | |
3718 | mutex_lock(&svms->lock); |
3719 | |
	node = interval_tree_iter_first(&svms->objects, start, last);
	if (!node) {
		pr_debug("range attrs not found return default values\n");
		svm_range_set_default_attributes(&location, &prefetch_loc,
						 &granularity, &flags_and);
		flags_or = flags_and;
		if (p->xnack_enabled)
			bitmap_copy(bitmap_access, svms->bitmap_supported,
				    MAX_GPU_INSTANCE);
		else
			bitmap_zero(bitmap_access, MAX_GPU_INSTANCE);
		bitmap_zero(bitmap_aip, MAX_GPU_INSTANCE);
		goto fill_values;
	}
	bitmap_copy(bitmap_access, svms->bitmap_supported, MAX_GPU_INSTANCE);
	bitmap_copy(bitmap_aip, svms->bitmap_supported, MAX_GPU_INSTANCE);
3736 | |
3737 | while (node) { |
3738 | struct interval_tree_node *next; |
3739 | |
3740 | prange = container_of(node, struct svm_range, it_node); |
3741 | next = interval_tree_iter_next(node, start, last); |
3742 | |
3743 | if (get_preferred_loc) { |
3744 | if (prange->preferred_loc == |
3745 | KFD_IOCTL_SVM_LOCATION_UNDEFINED || |
3746 | (location != KFD_IOCTL_SVM_LOCATION_UNDEFINED && |
3747 | location != prange->preferred_loc)) { |
3748 | location = KFD_IOCTL_SVM_LOCATION_UNDEFINED; |
3749 | get_preferred_loc = false; |
3750 | } else { |
3751 | location = prange->preferred_loc; |
3752 | } |
3753 | } |
3754 | if (get_prefetch_loc) { |
3755 | if (prange->prefetch_loc == |
3756 | KFD_IOCTL_SVM_LOCATION_UNDEFINED || |
3757 | (prefetch_loc != KFD_IOCTL_SVM_LOCATION_UNDEFINED && |
3758 | prefetch_loc != prange->prefetch_loc)) { |
3759 | prefetch_loc = KFD_IOCTL_SVM_LOCATION_UNDEFINED; |
3760 | get_prefetch_loc = false; |
3761 | } else { |
3762 | prefetch_loc = prange->prefetch_loc; |
3763 | } |
3764 | } |
3765 | if (get_accessible) { |
			bitmap_and(bitmap_access, bitmap_access,
				   prange->bitmap_access, MAX_GPU_INSTANCE);
			bitmap_and(bitmap_aip, bitmap_aip,
				   prange->bitmap_aip, MAX_GPU_INSTANCE);
3770 | } |
3771 | if (get_flags) { |
3772 | flags_and &= prange->flags; |
3773 | flags_or |= prange->flags; |
3774 | } |
3775 | |
3776 | if (get_granularity && prange->granularity < granularity) |
3777 | granularity = prange->granularity; |
3778 | |
3779 | node = next; |
3780 | } |
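	/* Worked example of the merge above (hypothetical values): if the
	 * interval intersects two ranges with flags 0x3 and 0x5, then
	 * flags_and = 0x3 & 0x5 = 0x1 (flags set on every range) and
	 * flags_or = 0x3 | 0x5 = 0x7, so the SET_FLAGS query reports 0x1 and
	 * the CLR_FLAGS query reports ~0x7 (flags clear on every range).
	 */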
3781 | fill_values: |
	mutex_unlock(&svms->lock);
3783 | |
3784 | for (i = 0; i < nattr; i++) { |
3785 | switch (attrs[i].type) { |
3786 | case KFD_IOCTL_SVM_ATTR_PREFERRED_LOC: |
3787 | attrs[i].value = location; |
3788 | break; |
3789 | case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC: |
3790 | attrs[i].value = prefetch_loc; |
3791 | break; |
3792 | case KFD_IOCTL_SVM_ATTR_ACCESS: |
			gpuidx = kfd_process_gpuidx_from_gpuid(p,
							       attrs[i].value);
			if (gpuidx < 0) {
				pr_debug("invalid gpuid %x\n", attrs[i].value);
3797 | return -EINVAL; |
3798 | } |
3799 | if (test_bit(gpuidx, bitmap_access)) |
3800 | attrs[i].type = KFD_IOCTL_SVM_ATTR_ACCESS; |
3801 | else if (test_bit(gpuidx, bitmap_aip)) |
3802 | attrs[i].type = |
3803 | KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE; |
3804 | else |
3805 | attrs[i].type = KFD_IOCTL_SVM_ATTR_NO_ACCESS; |
3806 | break; |
3807 | case KFD_IOCTL_SVM_ATTR_SET_FLAGS: |
3808 | attrs[i].value = flags_and; |
3809 | break; |
3810 | case KFD_IOCTL_SVM_ATTR_CLR_FLAGS: |
3811 | attrs[i].value = ~flags_or; |
3812 | break; |
3813 | case KFD_IOCTL_SVM_ATTR_GRANULARITY: |
3814 | attrs[i].value = (uint32_t)granularity; |
3815 | break; |
3816 | } |
3817 | } |
3818 | |
3819 | return 0; |
3820 | } |
3821 | |
3822 | int kfd_criu_resume_svm(struct kfd_process *p) |
3823 | { |
3824 | struct kfd_ioctl_svm_attribute *set_attr_new, *set_attr = NULL; |
3825 | int nattr_common = 4, nattr_accessibility = 1; |
3826 | struct criu_svm_metadata *criu_svm_md = NULL; |
3827 | struct svm_range_list *svms = &p->svms; |
3828 | struct criu_svm_metadata *next = NULL; |
3829 | uint32_t set_flags = 0xffffffff; |
3830 | int i, j, num_attrs, ret = 0; |
3831 | uint64_t set_attr_size; |
3832 | struct mm_struct *mm; |
3833 | |
	if (list_empty(&svms->criu_svm_metadata_list)) {
		pr_debug("No SVM data from CRIU restore stage 2\n");
3836 | return ret; |
3837 | } |
3838 | |
	mm = get_task_mm(p->lead_thread);
	if (!mm) {
		pr_err("failed to get mm for the target process\n");
3842 | return -ESRCH; |
3843 | } |
3844 | |
3845 | num_attrs = nattr_common + (nattr_accessibility * p->n_pdds); |
3846 | |
3847 | i = j = 0; |
3848 | list_for_each_entry(criu_svm_md, &svms->criu_svm_metadata_list, list) { |
		pr_debug("criu_svm_md[%d]\n\tstart: 0x%llx size: 0x%llx (npages)\n",
3850 | i, criu_svm_md->data.start_addr, criu_svm_md->data.size); |
3851 | |
3852 | for (j = 0; j < num_attrs; j++) { |
			pr_debug("\ncriu_svm_md[%d]->attrs[%d].type : 0x%x\ncriu_svm_md[%d]->attrs[%d].value : 0x%x\n",
3854 | i, j, criu_svm_md->data.attrs[j].type, |
3855 | i, j, criu_svm_md->data.attrs[j].value); |
3856 | switch (criu_svm_md->data.attrs[j].type) { |
			/* During the Checkpoint operation, the query for the
			 * KFD_IOCTL_SVM_ATTR_PREFETCH_LOC attribute might
			 * return KFD_IOCTL_SVM_LOCATION_UNDEFINED if it was
			 * not used by the range which was checkpointed. Care
			 * must be taken not to restore with an invalid value;
			 * otherwise the gpuidx value will be invalid and
			 * set_attr would eventually fail, so just replace
			 * those entries with another dummy attribute such as
			 * KFD_IOCTL_SVM_ATTR_SET_FLAGS.
			 */
3867 | case KFD_IOCTL_SVM_ATTR_PREFETCH_LOC: |
3868 | if (criu_svm_md->data.attrs[j].value == |
3869 | KFD_IOCTL_SVM_LOCATION_UNDEFINED) { |
3870 | criu_svm_md->data.attrs[j].type = |
3871 | KFD_IOCTL_SVM_ATTR_SET_FLAGS; |
3872 | criu_svm_md->data.attrs[j].value = 0; |
3873 | } |
3874 | break; |
3875 | case KFD_IOCTL_SVM_ATTR_SET_FLAGS: |
3876 | set_flags = criu_svm_md->data.attrs[j].value; |
3877 | break; |
3878 | default: |
3879 | break; |
3880 | } |
3881 | } |
3882 | |
		/* CLR_FLAGS is not available via get_attr during checkpoint, but
		 * it needs to be inserted before restoring the ranges, so
		 * allocate extra space for it before calling set_attr.
		 */
3887 | set_attr_size = sizeof(struct kfd_ioctl_svm_attribute) * |
3888 | (num_attrs + 1); |
		set_attr_new = krealloc(set_attr, set_attr_size,
					GFP_KERNEL);
3891 | if (!set_attr_new) { |
3892 | ret = -ENOMEM; |
3893 | goto exit; |
3894 | } |
3895 | set_attr = set_attr_new; |
3896 | |
3897 | memcpy(set_attr, criu_svm_md->data.attrs, num_attrs * |
3898 | sizeof(struct kfd_ioctl_svm_attribute)); |
3899 | set_attr[num_attrs].type = KFD_IOCTL_SVM_ATTR_CLR_FLAGS; |
3900 | set_attr[num_attrs].value = ~set_flags; |
3901 | |
		ret = svm_range_set_attr(p, mm, criu_svm_md->data.start_addr,
					 criu_svm_md->data.size, num_attrs + 1,
					 set_attr);
		if (ret) {
			pr_err("CRIU: failed to set range attributes\n");
3907 | goto exit; |
3908 | } |
3909 | |
3910 | i++; |
3911 | } |
3912 | exit: |
	kfree(set_attr);
	list_for_each_entry_safe(criu_svm_md, next, &svms->criu_svm_metadata_list, list) {
		pr_debug("freeing criu_svm_md[]\n\tstart: 0x%llx\n",
			 criu_svm_md->data.start_addr);
		kfree(criu_svm_md);
3918 | } |
3919 | |
3920 | mmput(mm); |
3921 | return ret; |
3922 | |
3923 | } |
3924 | |
3925 | int kfd_criu_restore_svm(struct kfd_process *p, |
3926 | uint8_t __user *user_priv_ptr, |
3927 | uint64_t *priv_data_offset, |
3928 | uint64_t max_priv_data_size) |
3929 | { |
3930 | uint64_t svm_priv_data_size, svm_object_md_size, svm_attrs_size; |
3931 | int nattr_common = 4, nattr_accessibility = 1; |
3932 | struct criu_svm_metadata *criu_svm_md = NULL; |
3933 | struct svm_range_list *svms = &p->svms; |
3934 | uint32_t num_devices; |
3935 | int ret = 0; |
3936 | |
3937 | num_devices = p->n_pdds; |
	/* Handle one SVM range object at a time. Also, the number of gpus is
	 * assumed to be the same on the restore node; this must be checked
	 * while evaluating the topology earlier.
	 */
3942 | |
3943 | svm_attrs_size = sizeof(struct kfd_ioctl_svm_attribute) * |
3944 | (nattr_common + nattr_accessibility * num_devices); |
3945 | svm_object_md_size = sizeof(struct criu_svm_metadata) + svm_attrs_size; |
3946 | |
3947 | svm_priv_data_size = sizeof(struct kfd_criu_svm_range_priv_data) + |
3948 | svm_attrs_size; |
3949 | |
	criu_svm_md = kzalloc(svm_object_md_size, GFP_KERNEL);
	if (!criu_svm_md) {
		pr_err("failed to allocate memory to store svm metadata\n");
3953 | return -ENOMEM; |
3954 | } |
3955 | if (*priv_data_offset + svm_priv_data_size > max_priv_data_size) { |
3956 | ret = -EINVAL; |
3957 | goto exit; |
3958 | } |
3959 | |
	ret = copy_from_user(&criu_svm_md->data, user_priv_ptr + *priv_data_offset,
			     svm_priv_data_size);
3962 | if (ret) { |
3963 | ret = -EFAULT; |
3964 | goto exit; |
3965 | } |
3966 | *priv_data_offset += svm_priv_data_size; |
3967 | |
	list_add_tail(&criu_svm_md->list, &svms->criu_svm_metadata_list);
3969 | |
3970 | return 0; |
3971 | |
3972 | |
3973 | exit: |
	kfree(criu_svm_md);
3975 | return ret; |
3976 | } |
3977 | |
3978 | int svm_range_get_info(struct kfd_process *p, uint32_t *num_svm_ranges, |
3979 | uint64_t *svm_priv_data_size) |
3980 | { |
3981 | uint64_t total_size, accessibility_size, common_attr_size; |
3982 | int nattr_common = 4, nattr_accessibility = 1; |
3983 | int num_devices = p->n_pdds; |
3984 | struct svm_range_list *svms; |
3985 | struct svm_range *prange; |
3986 | uint32_t count = 0; |
3987 | |
3988 | *svm_priv_data_size = 0; |
3989 | |
3990 | svms = &p->svms; |
3991 | if (!svms) |
3992 | return -EINVAL; |
3993 | |
3994 | mutex_lock(&svms->lock); |
3995 | list_for_each_entry(prange, &svms->list, list) { |
		pr_debug("prange: 0x%p start: 0x%lx\t npages: 0x%llx\t end: 0x%llx\n",
3997 | prange, prange->start, prange->npages, |
3998 | prange->start + prange->npages - 1); |
3999 | count++; |
4000 | } |
	mutex_unlock(&svms->lock);
4002 | |
4003 | *num_svm_ranges = count; |
	/* Only the accessibility attributes need to be queried for all the gpus
4005 | * individually, remaining ones are spanned across the entire process |
4006 | * regardless of the various gpu nodes. Of the remaining attributes, |
4007 | * KFD_IOCTL_SVM_ATTR_CLR_FLAGS need not be saved. |
4008 | * |
4009 | * KFD_IOCTL_SVM_ATTR_PREFERRED_LOC |
4010 | * KFD_IOCTL_SVM_ATTR_PREFETCH_LOC |
4011 | * KFD_IOCTL_SVM_ATTR_SET_FLAGS |
4012 | * KFD_IOCTL_SVM_ATTR_GRANULARITY |
4013 | * |
	 * ** ACCESSIBILITY ATTRIBUTES **
4015 | * (Considered as one, type is altered during query, value is gpuid) |
4016 | * KFD_IOCTL_SVM_ATTR_ACCESS |
4017 | * KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE |
4018 | * KFD_IOCTL_SVM_ATTR_NO_ACCESS |
4019 | */ |
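	/* Worked example with hypothetical numbers: for nattr_common = 4,
	 * nattr_accessibility = 1 and num_devices = 2, each range needs
	 * sizeof(struct kfd_criu_svm_range_priv_data) +
	 * (4 + 1 * 2) * sizeof(struct kfd_ioctl_svm_attribute) bytes of
	 * private data, and *svm_priv_data_size is that amount multiplied
	 * by *num_svm_ranges.
	 */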
4020 | if (*num_svm_ranges > 0) { |
4021 | common_attr_size = sizeof(struct kfd_ioctl_svm_attribute) * |
4022 | nattr_common; |
4023 | accessibility_size = sizeof(struct kfd_ioctl_svm_attribute) * |
4024 | nattr_accessibility * num_devices; |
4025 | |
4026 | total_size = sizeof(struct kfd_criu_svm_range_priv_data) + |
4027 | common_attr_size + accessibility_size; |
4028 | |
4029 | *svm_priv_data_size = *num_svm_ranges * total_size; |
4030 | } |
4031 | |
	pr_debug("num_svm_ranges %u total_priv_size %llu\n", *num_svm_ranges,
4033 | *svm_priv_data_size); |
4034 | return 0; |
4035 | } |
4036 | |
4037 | int kfd_criu_checkpoint_svm(struct kfd_process *p, |
4038 | uint8_t __user *user_priv_data, |
4039 | uint64_t *priv_data_offset) |
4040 | { |
4041 | struct kfd_criu_svm_range_priv_data *svm_priv = NULL; |
4042 | struct kfd_ioctl_svm_attribute *query_attr = NULL; |
4043 | uint64_t svm_priv_data_size, query_attr_size = 0; |
4044 | int index, nattr_common = 4, ret = 0; |
4045 | struct svm_range_list *svms; |
4046 | int num_devices = p->n_pdds; |
4047 | struct svm_range *prange; |
4048 | struct mm_struct *mm; |
4049 | |
4050 | svms = &p->svms; |
4051 | if (!svms) |
4052 | return -EINVAL; |
4053 | |
	mm = get_task_mm(p->lead_thread);
	if (!mm) {
		pr_err("failed to get mm for the target process\n");
4057 | return -ESRCH; |
4058 | } |
4059 | |
4060 | query_attr_size = sizeof(struct kfd_ioctl_svm_attribute) * |
4061 | (nattr_common + num_devices); |
4062 | |
	query_attr = kzalloc(query_attr_size, GFP_KERNEL);
4064 | if (!query_attr) { |
4065 | ret = -ENOMEM; |
4066 | goto exit; |
4067 | } |
4068 | |
4069 | query_attr[0].type = KFD_IOCTL_SVM_ATTR_PREFERRED_LOC; |
4070 | query_attr[1].type = KFD_IOCTL_SVM_ATTR_PREFETCH_LOC; |
4071 | query_attr[2].type = KFD_IOCTL_SVM_ATTR_SET_FLAGS; |
4072 | query_attr[3].type = KFD_IOCTL_SVM_ATTR_GRANULARITY; |
4073 | |
4074 | for (index = 0; index < num_devices; index++) { |
4075 | struct kfd_process_device *pdd = p->pdds[index]; |
4076 | |
4077 | query_attr[index + nattr_common].type = |
4078 | KFD_IOCTL_SVM_ATTR_ACCESS; |
4079 | query_attr[index + nattr_common].value = pdd->user_gpu_id; |
4080 | } |
4081 | |
4082 | svm_priv_data_size = sizeof(*svm_priv) + query_attr_size; |
4083 | |
	svm_priv = kzalloc(svm_priv_data_size, GFP_KERNEL);
4085 | if (!svm_priv) { |
4086 | ret = -ENOMEM; |
4087 | goto exit_query; |
4088 | } |
4089 | |
4090 | index = 0; |
4091 | list_for_each_entry(prange, &svms->list, list) { |
4092 | |
4093 | svm_priv->object_type = KFD_CRIU_OBJECT_TYPE_SVM_RANGE; |
4094 | svm_priv->start_addr = prange->start; |
4095 | svm_priv->size = prange->npages; |
4096 | memcpy(&svm_priv->attrs, query_attr, query_attr_size); |
		pr_debug("CRIU: prange: 0x%p start: 0x%lx\t npages: 0x%llx end: 0x%llx\t size: 0x%llx\n",
4098 | prange, prange->start, prange->npages, |
4099 | prange->start + prange->npages - 1, |
4100 | prange->npages * PAGE_SIZE); |
4101 | |
		ret = svm_range_get_attr(p, mm, svm_priv->start_addr,
					 svm_priv->size,
					 (nattr_common + num_devices),
					 svm_priv->attrs);
		if (ret) {
			pr_err("CRIU: failed to obtain range attributes\n");
4108 | goto exit_priv; |
4109 | } |
4110 | |
		if (copy_to_user(user_priv_data + *priv_data_offset, svm_priv,
				 svm_priv_data_size)) {
			pr_err("Failed to copy svm priv to user\n");
4114 | ret = -EFAULT; |
4115 | goto exit_priv; |
4116 | } |
4117 | |
4118 | *priv_data_offset += svm_priv_data_size; |
4119 | |
4120 | } |
4121 | |
4122 | |
4123 | exit_priv: |
	kfree(svm_priv);
exit_query:
	kfree(query_attr);
4127 | exit: |
4128 | mmput(mm); |
4129 | return ret; |
4130 | } |
4131 | |
4132 | int |
4133 | svm_ioctl(struct kfd_process *p, enum kfd_ioctl_svm_op op, uint64_t start, |
4134 | uint64_t size, uint32_t nattrs, struct kfd_ioctl_svm_attribute *attrs) |
4135 | { |
4136 | struct mm_struct *mm = current->mm; |
4137 | int r; |
4138 | |
4139 | start >>= PAGE_SHIFT; |
4140 | size >>= PAGE_SHIFT; |
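
	/* Worked example (hypothetical values, 4 KiB pages): a request with
	 * start = 0x7f0000200000 and size = 0x200000 bytes (2 MiB) becomes
	 * start = 0x7f0000200 and size = 0x200 pages after the shifts above.
	 */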
4141 | |
4142 | switch (op) { |
4143 | case KFD_IOCTL_SVM_OP_SET_ATTR: |
		r = svm_range_set_attr(p, mm, start, size, nattrs, attrs);
4145 | break; |
4146 | case KFD_IOCTL_SVM_OP_GET_ATTR: |
		r = svm_range_get_attr(p, mm, start, size, nattrs, attrs);
4148 | break; |
4149 | default: |
		r = -EINVAL;
4151 | break; |
4152 | } |
4153 | |
4154 | return r; |
4155 | } |
4156 | |