// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
 * Copyright 2020-2021 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
#include <linux/types.h>
#include <linux/hmm.h>
#include <linux/dma-direction.h>
#include <linux/dma-mapping.h>
#include <linux/migrate.h>
#include "amdgpu_sync.h"
#include "amdgpu_object.h"
#include "amdgpu_vm.h"
#include "amdgpu_res_cursor.h"
#include "kfd_priv.h"
#include "kfd_svm.h"
#include "kfd_migrate.h"
#include "kfd_smi_events.h"

#ifdef dev_fmt
#undef dev_fmt
#endif
#define dev_fmt(fmt) "kfd_migrate: " fmt

static uint64_t
svm_migrate_direct_mapping_addr(struct amdgpu_device *adev, uint64_t addr)
{
	return addr + amdgpu_ttm_domain_start(adev, TTM_PL_VRAM);
}

static int
svm_migrate_gart_map(struct amdgpu_ring *ring, uint64_t npages,
		     dma_addr_t *addr, uint64_t *gart_addr, uint64_t flags)
{
	struct amdgpu_device *adev = ring->adev;
	struct amdgpu_job *job;
	unsigned int num_dw, num_bytes;
	struct dma_fence *fence;
	uint64_t src_addr, dst_addr;
	uint64_t pte_flags;
	void *cpu_addr;
	int r;

	/* use gart window 0 */
	*gart_addr = adev->gmc.gart_start;

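	/* The IB holds the sdma copy packet (at most num_dw dwords) followed
	 * by num_bytes of GART PTEs (8 bytes per page). The copy below writes
	 * those PTEs into the start of the GART table, so window 0 maps the
	 * system pages passed in @addr.
	 */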
	num_dw = ALIGN(adev->mman.buffer_funcs->copy_num_dw, 8);
	num_bytes = npages * 8;

	r = amdgpu_job_alloc_with_ib(adev, &adev->mman.high_pr,
				     AMDGPU_FENCE_OWNER_UNDEFINED,
				     num_dw * 4 + num_bytes,
				     AMDGPU_IB_POOL_DELAYED,
				     &job);
	if (r)
		return r;

	src_addr = num_dw * 4;
	src_addr += job->ibs[0].gpu_addr;

	dst_addr = amdgpu_bo_gpu_offset(adev->gart.bo);
	amdgpu_emit_copy_buffer(adev, &job->ibs[0], src_addr,
				dst_addr, num_bytes, false);

	amdgpu_ring_pad_ib(ring, &job->ibs[0]);
	WARN_ON(job->ibs[0].length_dw > num_dw);

	pte_flags = AMDGPU_PTE_VALID | AMDGPU_PTE_READABLE;
	pte_flags |= AMDGPU_PTE_SYSTEM | AMDGPU_PTE_SNOOPED;
	if (!(flags & KFD_IOCTL_SVM_FLAG_GPU_RO))
		pte_flags |= AMDGPU_PTE_WRITEABLE;
	pte_flags |= adev->gart.gart_pte_flags;

	cpu_addr = &job->ibs[0].ptr[num_dw];

	amdgpu_gart_map(adev, 0, npages, addr, pte_flags, cpu_addr);
	fence = amdgpu_job_submit(job);
	dma_fence_put(fence);

	return r;
}

/**
 * svm_migrate_copy_memory_gart - sdma copy data between ram and vram
 *
 * @adev: amdgpu device the sdma ring is running on
 * @sys: system DMA pointer to be copied
 * @vram: vram destination DMA pointer
 * @npages: number of pages to copy
 * @direction: enum MIGRATION_COPY_DIR
 * @mfence: output, sdma fence to signal after sdma is done
 *
 * The ram side is accessed through contiguous GART table entries mapping the
 * ram pages, the vram side uses the direct mapping of the vram pages, which
 * must be npages contiguous pages.
 * GART updates and sdma copies share the same buffer-copy ring. The copy is
 * split into chunks of at most GTT_MAX_PAGES pages, all sdma operations are
 * serialized, and the fence of the last sdma operation is returned so the
 * caller can wait for the whole copy to finish.
 *
 * Context: Process context, takes and releases gtt_window_lock
 *
 * Return:
 * 0 - OK, otherwise error code
 */

static int
svm_migrate_copy_memory_gart(struct amdgpu_device *adev, dma_addr_t *sys,
			     uint64_t *vram, uint64_t npages,
			     enum MIGRATION_COPY_DIR direction,
			     struct dma_fence **mfence)
{
	const uint64_t GTT_MAX_PAGES = AMDGPU_GTT_MAX_TRANSFER_SIZE;
	struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
	uint64_t gart_s, gart_d;
	struct dma_fence *next;
	uint64_t size;
	int r;

	mutex_lock(&adev->mman.gtt_window_lock);

	while (npages) {
		size = min(GTT_MAX_PAGES, npages);

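		/* Only the system memory side needs a GART mapping through
		 * window 0; the vram side is addressed via the direct VRAM
		 * mapping. For ram-to-vram copies the system pages are mapped
		 * read-only.
		 */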
		if (direction == FROM_VRAM_TO_RAM) {
			gart_s = svm_migrate_direct_mapping_addr(adev, *vram);
			r = svm_migrate_gart_map(ring, size, sys, &gart_d, 0);

		} else if (direction == FROM_RAM_TO_VRAM) {
			r = svm_migrate_gart_map(ring, size, sys, &gart_s,
						 KFD_IOCTL_SVM_FLAG_GPU_RO);
			gart_d = svm_migrate_direct_mapping_addr(adev, *vram);
		}
		if (r) {
			dev_err(adev->dev, "fail %d create gart mapping\n", r);
			goto out_unlock;
		}

		r = amdgpu_copy_buffer(ring, gart_s, gart_d, size * PAGE_SIZE,
				       NULL, &next, false, true, false);
		if (r) {
			dev_err(adev->dev, "fail %d to copy memory\n", r);
			goto out_unlock;
		}

		dma_fence_put(*mfence);
		*mfence = next;
		npages -= size;
		if (npages) {
			sys += size;
			vram += size;
		}
	}

out_unlock:
	mutex_unlock(&adev->mman.gtt_window_lock);

	return r;
}

/**
 * svm_migrate_copy_done - wait for the sdma memory copy to finish
 *
 * @adev: amdgpu device the sdma memory copy is executing on
 * @mfence: migrate fence
 *
 * Wait for the dma fence to be signaled. If the copy was split into multiple
 * sdma operations, this is the fence of the last sdma operation.
 *
 * Context: called after svm_migrate_copy_memory
 *
 * Return:
 * 0 - success
 * otherwise - error code from dma fence signal
 */
static int
svm_migrate_copy_done(struct amdgpu_device *adev, struct dma_fence *mfence)
{
	int r = 0;

	if (mfence) {
		r = dma_fence_wait(mfence, false);
		dma_fence_put(mfence);
		pr_debug("sdma copy memory fence done\n");
	}

	return r;
}

unsigned long
svm_migrate_addr_to_pfn(struct amdgpu_device *adev, unsigned long addr)
{
	return (addr + adev->kfd.pgmap.range.start) >> PAGE_SHIFT;
}

static void
svm_migrate_get_vram_page(struct svm_range *prange, unsigned long pfn)
{
	struct page *page;

	page = pfn_to_page(pfn);
	svm_range_bo_ref(prange->svm_bo);
	page->zone_device_data = prange->svm_bo;
	zone_device_page_init(page);
}

static void
svm_migrate_put_vram_page(struct amdgpu_device *adev, unsigned long addr)
{
	struct page *page;

	page = pfn_to_page(svm_migrate_addr_to_pfn(adev, addr));
	unlock_page(page);
	put_page(page);
}

static unsigned long
svm_migrate_addr(struct amdgpu_device *adev, struct page *page)
{
	unsigned long addr;

	addr = page_to_pfn(page) << PAGE_SHIFT;
	return (addr - adev->kfd.pgmap.range.start);
}

static struct page *
svm_migrate_get_sys_page(struct vm_area_struct *vma, unsigned long addr)
{
	struct page *page;

	page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
	if (page)
		lock_page(page);

	return page;
}

static void svm_migrate_put_sys_page(unsigned long addr)
{
	struct page *page;

	page = pfn_to_page(addr >> PAGE_SHIFT);
	unlock_page(page);
	put_page(page);
}

static unsigned long svm_migrate_unsuccessful_pages(struct migrate_vma *migrate)
{
	unsigned long upages = 0;
	unsigned long i;

	for (i = 0; i < migrate->npages; i++) {
		if (migrate->src[i] & MIGRATE_PFN_VALID &&
		    !(migrate->src[i] & MIGRATE_PFN_MIGRATE))
			upages++;
	}
	return upages;
}

static int
svm_migrate_copy_to_vram(struct kfd_node *node, struct svm_range *prange,
			 struct migrate_vma *migrate, struct dma_fence **mfence,
			 dma_addr_t *scratch, uint64_t ttm_res_offset)
{
	uint64_t npages = migrate->cpages;
	struct amdgpu_device *adev = node->adev;
	struct device *dev = adev->dev;
	struct amdgpu_res_cursor cursor;
	dma_addr_t *src;
	uint64_t *dst;
	uint64_t i, j;
	int r;

	pr_debug("svms 0x%p [0x%lx 0x%lx 0x%llx]\n", prange->svms, prange->start,
		 prange->last, ttm_res_offset);

	src = scratch;
	dst = (uint64_t *)(scratch + npages);

	amdgpu_res_first(prange->ttm_res, ttm_res_offset,
			 npages << PAGE_SHIFT, &cursor);
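
	/* Collect runs of contiguous pages and issue one sdma copy per run.
	 * A run is flushed when a source page cannot be migrated or when the
	 * current VRAM cursor segment is exhausted.
	 */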
	for (i = j = 0; i < npages; i++) {
		struct page *spage;

		dst[i] = cursor.start + (j << PAGE_SHIFT);
		migrate->dst[i] = svm_migrate_addr_to_pfn(adev, dst[i]);
		svm_migrate_get_vram_page(prange, migrate->dst[i]);
		migrate->dst[i] = migrate_pfn(migrate->dst[i]);

		spage = migrate_pfn_to_page(migrate->src[i]);
		if (spage && !is_zone_device_page(spage)) {
			src[i] = dma_map_page(dev, spage, 0, PAGE_SIZE,
					      DMA_TO_DEVICE);
			r = dma_mapping_error(dev, src[i]);
			if (r) {
				dev_err(dev, "%s: fail %d dma_map_page\n",
					__func__, r);
				goto out_free_vram_pages;
			}
		} else {
			if (j) {
				r = svm_migrate_copy_memory_gart(
						adev, src + i - j,
						dst + i - j, j,
						FROM_RAM_TO_VRAM,
						mfence);
				if (r)
					goto out_free_vram_pages;
				amdgpu_res_next(&cursor, (j + 1) << PAGE_SHIFT);
				j = 0;
			} else {
				amdgpu_res_next(&cursor, PAGE_SIZE);
			}
			continue;
		}

		pr_debug_ratelimited("dma mapping src to 0x%llx, pfn 0x%lx\n",
				     src[i] >> PAGE_SHIFT, page_to_pfn(spage));

		if (j >= (cursor.size >> PAGE_SHIFT) - 1 && i < npages - 1) {
			r = svm_migrate_copy_memory_gart(adev, src + i - j,
							 dst + i - j, j + 1,
							 FROM_RAM_TO_VRAM,
							 mfence);
			if (r)
				goto out_free_vram_pages;
			amdgpu_res_next(&cursor, (j + 1) * PAGE_SIZE);
			j = 0;
		} else {
			j++;
		}
	}

	r = svm_migrate_copy_memory_gart(adev, src + i - j, dst + i - j, j,
					 FROM_RAM_TO_VRAM, mfence);

out_free_vram_pages:
	if (r) {
		pr_debug("failed %d to copy memory to vram\n", r);
		while (i--) {
			svm_migrate_put_vram_page(adev, dst[i]);
			migrate->dst[i] = 0;
		}
	}

#ifdef DEBUG_FORCE_MIXED_DOMAINS
	for (i = 0, j = 0; i < npages; i += 4, j++) {
		if (j & 1)
			continue;
		svm_migrate_put_vram_page(adev, dst[i]);
		migrate->dst[i] = 0;
		svm_migrate_put_vram_page(adev, dst[i + 1]);
		migrate->dst[i + 1] = 0;
		svm_migrate_put_vram_page(adev, dst[i + 2]);
		migrate->dst[i + 2] = 0;
		svm_migrate_put_vram_page(adev, dst[i + 3]);
		migrate->dst[i + 3] = 0;
	}
#endif

	return r;
}

static long
svm_migrate_vma_to_vram(struct kfd_node *node, struct svm_range *prange,
			struct vm_area_struct *vma, uint64_t start,
			uint64_t end, uint32_t trigger, uint64_t ttm_res_offset)
{
	struct kfd_process *p = container_of(prange->svms, struct kfd_process, svms);
	uint64_t npages = (end - start) >> PAGE_SHIFT;
	struct amdgpu_device *adev = node->adev;
	struct kfd_process_device *pdd;
	struct dma_fence *mfence = NULL;
	struct migrate_vma migrate = { 0 };
	unsigned long cpages = 0;
	unsigned long mpages = 0;
	dma_addr_t *scratch;
	void *buf;
	int r = -ENOMEM;

	memset(&migrate, 0, sizeof(migrate));
	migrate.vma = vma;
	migrate.start = start;
	migrate.end = end;
	migrate.flags = MIGRATE_VMA_SELECT_SYSTEM;
	migrate.pgmap_owner = SVM_ADEV_PGMAP_OWNER(adev);

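	/* One allocation backs four per-page arrays: the migrate.src and
	 * migrate.dst pfn arrays, plus the scratch space holding the system
	 * page dma addresses and the vram destination addresses used by
	 * svm_migrate_copy_to_vram().
	 */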
	buf = kvcalloc(npages,
		       2 * sizeof(*migrate.src) + sizeof(uint64_t) + sizeof(dma_addr_t),
		       GFP_KERNEL);
	if (!buf)
		goto out;

	migrate.src = buf;
	migrate.dst = migrate.src + npages;
	scratch = (dma_addr_t *)(migrate.dst + npages);

	kfd_smi_event_migration_start(node, p->lead_thread->pid,
				      start >> PAGE_SHIFT, end >> PAGE_SHIFT,
				      0, node->id, prange->prefetch_loc,
				      prange->preferred_loc, trigger);

	r = migrate_vma_setup(&migrate);
	if (r) {
		dev_err(adev->dev, "%s: vma setup fail %d range [0x%lx 0x%lx]\n",
			__func__, r, prange->start, prange->last);
		goto out_free;
	}

	cpages = migrate.cpages;
	if (!cpages) {
		pr_debug("failed collect migrate sys pages [0x%lx 0x%lx]\n",
			 prange->start, prange->last);
		goto out_free;
	}
	if (cpages != npages)
		pr_debug("partial migration, 0x%lx/0x%llx pages collected\n",
			 cpages, npages);
	else
		pr_debug("0x%lx pages collected\n", cpages);

	r = svm_migrate_copy_to_vram(node, prange, &migrate, &mfence, scratch, ttm_res_offset);
	migrate_vma_pages(&migrate);

	svm_migrate_copy_done(adev, mfence);
	migrate_vma_finalize(&migrate);

	mpages = cpages - svm_migrate_unsuccessful_pages(&migrate);
	pr_debug("successful/cpages/npages 0x%lx/0x%lx/0x%lx\n",
		 mpages, cpages, migrate.npages);

	kfd_smi_event_migration_end(node, p->lead_thread->pid,
				    start >> PAGE_SHIFT, end >> PAGE_SHIFT,
				    0, node->id, trigger);

	svm_range_dma_unmap_dev(adev->dev, scratch, 0, npages);

out_free:
	kvfree(buf);
out:
	if (!r && mpages) {
		pdd = svm_range_get_pdd_by_node(prange, node);
		if (pdd)
			WRITE_ONCE(pdd->page_in, pdd->page_in + mpages);

		return mpages;
	}
	return r;
}

/**
 * svm_migrate_ram_to_vram - migrate svm range from system to device
 * @prange: range structure
 * @best_loc: the device to migrate to
 * @start_mgr: start page to migrate
 * @last_mgr: last page to migrate
 * @mm: the process mm structure
 * @trigger: reason of migration
 *
 * Context: Process context, caller holds mmap read lock, svms lock, prange lock
 *
 * Return:
 * 0 - OK, otherwise error code
 */
static int
svm_migrate_ram_to_vram(struct svm_range *prange, uint32_t best_loc,
			unsigned long start_mgr, unsigned long last_mgr,
			struct mm_struct *mm, uint32_t trigger)
{
	unsigned long addr, start, end;
	struct vm_area_struct *vma;
	uint64_t ttm_res_offset;
	struct kfd_node *node;
	unsigned long mpages = 0;
	long r = 0;

	if (start_mgr < prange->start || last_mgr > prange->last) {
		pr_debug("range [0x%lx 0x%lx] out prange [0x%lx 0x%lx]\n",
			 start_mgr, last_mgr, prange->start, prange->last);
		return -EFAULT;
	}

	node = svm_range_get_node_by_id(prange, best_loc);
	if (!node) {
		pr_debug("failed to get kfd node by id 0x%x\n", best_loc);
		return -ENODEV;
	}

	pr_debug("svms 0x%p [0x%lx 0x%lx] in [0x%lx 0x%lx] to gpu 0x%x\n",
		 prange->svms, start_mgr, last_mgr, prange->start, prange->last,
		 best_loc);

	start = start_mgr << PAGE_SHIFT;
	end = (last_mgr + 1) << PAGE_SHIFT;

	r = svm_range_vram_node_new(node, prange, true);
	if (r) {
		dev_dbg(node->adev->dev, "fail %ld to alloc vram\n", r);
		return r;
	}
	ttm_res_offset = (start_mgr - prange->start + prange->offset) << PAGE_SHIFT;

	for (addr = start; addr < end;) {
		unsigned long next;

		vma = vma_lookup(mm, addr);
		if (!vma)
			break;

		next = min(vma->vm_end, end);
		r = svm_migrate_vma_to_vram(node, prange, vma, addr, next, trigger, ttm_res_offset);
		if (r < 0) {
			pr_debug("failed %ld to migrate\n", r);
			break;
		} else {
			mpages += r;
		}
		ttm_res_offset += next - addr;
		addr = next;
	}

	if (mpages) {
		prange->actual_loc = best_loc;
		prange->vram_pages += mpages;
	} else if (!prange->actual_loc) {
		/* if no pages were migrated and all pages of the prange are
		 * still in system ram, drop the svm_bo obtained from
		 * svm_range_vram_node_new
		 */
		svm_range_vram_node_free(prange);
	}

	return r < 0 ? r : 0;
}

static void svm_migrate_page_free(struct page *page)
{
	struct svm_range_bo *svm_bo = page->zone_device_data;

	if (svm_bo) {
		pr_debug_ratelimited("ref: %d\n", kref_read(&svm_bo->kref));
		svm_range_bo_unref_async(svm_bo);
	}
}

static int
svm_migrate_copy_to_ram(struct amdgpu_device *adev, struct svm_range *prange,
			struct migrate_vma *migrate, struct dma_fence **mfence,
			dma_addr_t *scratch, uint64_t npages)
{
	struct device *dev = adev->dev;
	uint64_t *src;
	dma_addr_t *dst;
	struct page *dpage;
	uint64_t i = 0, j;
	uint64_t addr;
	int r = 0;

	pr_debug("svms 0x%p [0x%lx 0x%lx]\n", prange->svms, prange->start,
		 prange->last);

	addr = migrate->start;

	src = (uint64_t *)(scratch + npages);
	dst = scratch;

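	/* Collect runs of contiguous vram source pages and issue one sdma
	 * copy per run; a run is flushed when a page is already in system
	 * memory or when the vram addresses stop being contiguous.
	 */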
	for (i = 0, j = 0; i < npages; i++, addr += PAGE_SIZE) {
		struct page *spage;

		spage = migrate_pfn_to_page(migrate->src[i]);
		if (!spage || !is_zone_device_page(spage)) {
			pr_debug("invalid page. Could be in CPU already svms 0x%p [0x%lx 0x%lx]\n",
				 prange->svms, prange->start, prange->last);
			if (j) {
				r = svm_migrate_copy_memory_gart(adev, dst + i - j,
								 src + i - j, j,
								 FROM_VRAM_TO_RAM,
								 mfence);
				if (r)
					goto out_oom;
				j = 0;
			}
			continue;
		}
		src[i] = svm_migrate_addr(adev, spage);
		if (j > 0 && src[i] != src[i - 1] + PAGE_SIZE) {
			r = svm_migrate_copy_memory_gart(adev, dst + i - j,
							 src + i - j, j,
							 FROM_VRAM_TO_RAM,
							 mfence);
			if (r)
				goto out_oom;
			j = 0;
		}

		dpage = svm_migrate_get_sys_page(migrate->vma, addr);
		if (!dpage) {
			pr_debug("failed get page svms 0x%p [0x%lx 0x%lx]\n",
				 prange->svms, prange->start, prange->last);
			r = -ENOMEM;
			goto out_oom;
		}

		dst[i] = dma_map_page(dev, dpage, 0, PAGE_SIZE, DMA_FROM_DEVICE);
		r = dma_mapping_error(dev, dst[i]);
		if (r) {
			dev_err(adev->dev, "%s: fail %d dma_map_page\n", __func__, r);
			goto out_oom;
		}

		pr_debug_ratelimited("dma mapping dst to 0x%llx, pfn 0x%lx\n",
				     dst[i] >> PAGE_SHIFT, page_to_pfn(dpage));

		migrate->dst[i] = migrate_pfn(page_to_pfn(dpage));
		j++;
	}

	r = svm_migrate_copy_memory_gart(adev, dst + i - j, src + i - j, j,
					 FROM_VRAM_TO_RAM, mfence);

out_oom:
	if (r) {
		pr_debug("failed %d copy to ram\n", r);
		while (i--) {
			svm_migrate_put_sys_page(dst[i]);
			migrate->dst[i] = 0;
		}
	}

	return r;
}

/**
 * svm_migrate_vma_to_ram - migrate range inside one vma from device to system
 *
 * @prange: svm range structure
 * @vma: vm_area_struct that range [start, end] belongs to
 * @start: range start virtual address in pages
 * @end: range end virtual address in pages
 * @node: kfd node device to migrate from
 * @trigger: reason of migration
 * @fault_page: the fault page (vmf->page) when called from the CPU page fault
 *              handler svm_migrate_to_ram()
 *
 * Context: Process context, caller holds mmap read lock, prange->migrate_mutex
 *
 * Return:
 * negative values - indicate error
 * positive values or zero - number of pages migrated
 */
static long
svm_migrate_vma_to_ram(struct kfd_node *node, struct svm_range *prange,
		       struct vm_area_struct *vma, uint64_t start, uint64_t end,
		       uint32_t trigger, struct page *fault_page)
{
	struct kfd_process *p = container_of(prange->svms, struct kfd_process, svms);
	uint64_t npages = (end - start) >> PAGE_SHIFT;
	unsigned long upages = npages;
	unsigned long cpages = 0;
	unsigned long mpages = 0;
	struct amdgpu_device *adev = node->adev;
	struct kfd_process_device *pdd;
	struct dma_fence *mfence = NULL;
	struct migrate_vma migrate = { 0 };
	dma_addr_t *scratch;
	void *buf;
	int r = -ENOMEM;

	memset(&migrate, 0, sizeof(migrate));
	migrate.vma = vma;
	migrate.start = start;
	migrate.end = end;
	migrate.pgmap_owner = SVM_ADEV_PGMAP_OWNER(adev);
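
	/* Select the source page type matching how vram was registered in
	 * kgd2kfd_init_zone_device(): device-coherent when the GPU is
	 * coherently connected to the CPU, device-private otherwise.
	 */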
	if (adev->gmc.xgmi.connected_to_cpu)
		migrate.flags = MIGRATE_VMA_SELECT_DEVICE_COHERENT;
	else
		migrate.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;

	buf = kvcalloc(npages,
		       2 * sizeof(*migrate.src) + sizeof(uint64_t) + sizeof(dma_addr_t),
		       GFP_KERNEL);
	if (!buf)
		goto out;

	migrate.src = buf;
	migrate.dst = migrate.src + npages;
	migrate.fault_page = fault_page;
	scratch = (dma_addr_t *)(migrate.dst + npages);

	kfd_smi_event_migration_start(node, p->lead_thread->pid,
				      start >> PAGE_SHIFT, end >> PAGE_SHIFT,
				      node->id, 0, prange->prefetch_loc,
				      prange->preferred_loc, trigger);

	r = migrate_vma_setup(&migrate);
	if (r) {
		dev_err(adev->dev, "%s: vma setup fail %d range [0x%lx 0x%lx]\n",
			__func__, r, prange->start, prange->last);
		goto out_free;
	}

	cpages = migrate.cpages;
	if (!cpages) {
		pr_debug("failed collect migrate device pages [0x%lx 0x%lx]\n",
			 prange->start, prange->last);
		upages = svm_migrate_unsuccessful_pages(&migrate);
		goto out_free;
	}
	if (cpages != npages)
		pr_debug("partial migration, 0x%lx/0x%llx pages collected\n",
			 cpages, npages);
	else
		pr_debug("0x%lx pages collected\n", cpages);

	r = svm_migrate_copy_to_ram(adev, prange, &migrate, &mfence,
				    scratch, npages);
	migrate_vma_pages(&migrate);

	upages = svm_migrate_unsuccessful_pages(&migrate);
	pr_debug("unsuccessful/cpages/npages 0x%lx/0x%lx/0x%lx\n",
		 upages, cpages, migrate.npages);

	svm_migrate_copy_done(adev, mfence);
	migrate_vma_finalize(&migrate);

	kfd_smi_event_migration_end(node, p->lead_thread->pid,
				    start >> PAGE_SHIFT, end >> PAGE_SHIFT,
				    node->id, 0, trigger);

	svm_range_dma_unmap_dev(adev->dev, scratch, 0, npages);

out_free:
	kvfree(buf);
out:
	if (!r && cpages) {
		mpages = cpages - upages;
		pdd = svm_range_get_pdd_by_node(prange, node);
		if (pdd)
			WRITE_ONCE(pdd->page_out, pdd->page_out + mpages);
	}

	return r ? r : mpages;
}

/**
 * svm_migrate_vram_to_ram - migrate svm range from device to system
 * @prange: range structure
 * @mm: process mm, use current->mm if NULL
 * @start_mgr: start page that needs to be migrated to sys ram
 * @last_mgr: last page that needs to be migrated to sys ram
 * @trigger: reason of migration
 * @fault_page: the fault page (vmf->page) when called from the CPU page fault
 *              handler svm_migrate_to_ram()
 *
 * Context: Process context, caller holds mmap read lock, prange->migrate_mutex
 *
 * Return:
 * 0 - OK, otherwise error code
 */
int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm,
			    unsigned long start_mgr, unsigned long last_mgr,
			    uint32_t trigger, struct page *fault_page)
{
	struct kfd_node *node;
	struct vm_area_struct *vma;
	unsigned long addr;
	unsigned long start;
	unsigned long end;
	unsigned long mpages = 0;
	long r = 0;

	/* this prange has no vram pages to migrate to sys ram */
	if (!prange->actual_loc) {
		pr_debug("[0x%lx 0x%lx] already migrated to ram\n",
			 prange->start, prange->last);
		return 0;
	}

	if (start_mgr < prange->start || last_mgr > prange->last) {
		pr_debug("range [0x%lx 0x%lx] out prange [0x%lx 0x%lx]\n",
			 start_mgr, last_mgr, prange->start, prange->last);
		return -EFAULT;
	}

	node = svm_range_get_node_by_id(prange, prange->actual_loc);
	if (!node) {
		pr_debug("failed to get kfd node by id 0x%x\n", prange->actual_loc);
		return -ENODEV;
	}
	pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] from gpu 0x%x to ram\n",
		 prange->svms, prange, start_mgr, last_mgr,
		 prange->actual_loc);

	start = start_mgr << PAGE_SHIFT;
	end = (last_mgr + 1) << PAGE_SHIFT;

	for (addr = start; addr < end;) {
		unsigned long next;

		vma = vma_lookup(mm, addr);
		if (!vma) {
			pr_debug("failed to find vma for prange %p\n", prange);
			r = -EFAULT;
			break;
		}

		next = min(vma->vm_end, end);
		r = svm_migrate_vma_to_ram(node, prange, vma, addr, next, trigger,
					   fault_page);
		if (r < 0) {
			pr_debug("failed %ld to migrate prange %p\n", r, prange);
			break;
		} else {
			mpages += r;
		}
		addr = next;
	}

	if (r >= 0) {
		prange->vram_pages -= mpages;

		/* if the prange no longer has any vram pages, set its
		 * actual_loc to system memory and drop its svm_bo ref
		 */
		if (prange->vram_pages == 0 && prange->ttm_res) {
			prange->actual_loc = 0;
			svm_range_vram_node_free(prange);
		}
	}

	return r < 0 ? r : 0;
}

/**
 * svm_migrate_vram_to_vram - migrate svm range from device to device
 * @prange: range structure
 * @best_loc: the device to migrate to
 * @start: start page of the range to migrate to best_loc
 * @last: last page of the range to migrate to best_loc
 * @mm: process mm, use current->mm if NULL
 * @trigger: reason of migration
 *
 * Context: Process context, caller holds mmap read lock, svms lock, prange lock
 *
 * migrate all vram pages in prange to sys ram, then migrate
 * [start, last] pages from sys ram to gpu node best_loc.
 *
 * Return:
 * 0 - OK, otherwise error code
 */
static int
svm_migrate_vram_to_vram(struct svm_range *prange, uint32_t best_loc,
			 unsigned long start, unsigned long last,
			 struct mm_struct *mm, uint32_t trigger)
{
	int r, retries = 3;

	/*
	 * TODO: for both devices with PCIe large bar or on same xgmi hive, skip
	 * system memory as migration bridge
	 */

	pr_debug("from gpu 0x%x to gpu 0x%x\n", prange->actual_loc, best_loc);

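	/* Evict the whole prange to system memory first, retrying a few times
	 * if some vram pages could not be migrated, then migrate [start, last]
	 * from system memory to the new vram node.
	 */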
	do {
		r = svm_migrate_vram_to_ram(prange, mm, prange->start, prange->last,
					    trigger, NULL);
		if (r)
			return r;
	} while (prange->actual_loc && --retries);

	if (prange->actual_loc)
		return -EDEADLK;

	return svm_migrate_ram_to_vram(prange, best_loc, start, last, mm, trigger);
}

int
svm_migrate_to_vram(struct svm_range *prange, uint32_t best_loc,
		    unsigned long start, unsigned long last,
		    struct mm_struct *mm, uint32_t trigger)
{
	if (!prange->actual_loc || prange->actual_loc == best_loc)
		return svm_migrate_ram_to_vram(prange, best_loc, start, last,
					       mm, trigger);
	else
		return svm_migrate_vram_to_vram(prange, best_loc, start, last,
						mm, trigger);
}

/**
 * svm_migrate_to_ram - CPU page fault handler
 * @vmf: CPU vm fault vma, address
 *
 * Context: vm fault handler, caller holds the mmap read lock
 *
 * Return:
 * 0 - OK
 * VM_FAULT_SIGBUS - notify the application of a SIGBUS page fault
 */
static vm_fault_t svm_migrate_to_ram(struct vm_fault *vmf)
{
	unsigned long start, last, size;
	unsigned long addr = vmf->address;
	struct svm_range_bo *svm_bo;
	struct svm_range *prange;
	struct kfd_process *p;
	struct mm_struct *mm;
	int r = 0;

	svm_bo = vmf->page->zone_device_data;
	if (!svm_bo) {
		pr_debug("failed get device page at addr 0x%lx\n", addr);
		return VM_FAULT_SIGBUS;
	}
	if (!mmget_not_zero(svm_bo->eviction_fence->mm)) {
		pr_debug("addr 0x%lx of process mm is destroyed\n", addr);
		return VM_FAULT_SIGBUS;
	}

	mm = svm_bo->eviction_fence->mm;
	if (mm != vmf->vma->vm_mm)
		pr_debug("addr 0x%lx is COW mapping in child process\n", addr);

	p = kfd_lookup_process_by_mm(mm);
	if (!p) {
		pr_debug("failed find process at fault address 0x%lx\n", addr);
		r = VM_FAULT_SIGBUS;
		goto out_mmput;
	}
	if (READ_ONCE(p->svms.faulting_task) == current) {
		pr_debug("skipping ram migration\n");
		r = 0;
		goto out_unref_process;
	}

	pr_debug("CPU page fault svms 0x%p address 0x%lx\n", &p->svms, addr);
	addr >>= PAGE_SHIFT;

	mutex_lock(&p->svms.lock);

	prange = svm_range_from_addr(&p->svms, addr, NULL);
	if (!prange) {
		pr_debug("failed get range svms 0x%p addr 0x%lx\n", &p->svms, addr);
		r = -EFAULT;
		goto out_unlock_svms;
	}

	mutex_lock(&prange->migrate_mutex);

	if (!prange->actual_loc)
		goto out_unlock_prange;

	/* Align migration range start and size to granularity size */
	size = 1UL << prange->granularity;
	start = max(ALIGN_DOWN(addr, size), prange->start);
	last = min(ALIGN(addr + 1, size) - 1, prange->last);

	r = svm_migrate_vram_to_ram(prange, vmf->vma->vm_mm, start, last,
				    KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU, vmf->page);
	if (r)
		pr_debug("failed %d migrate svms 0x%p range 0x%p [0x%lx 0x%lx]\n",
			 r, prange->svms, prange, start, last);

out_unlock_prange:
	mutex_unlock(&prange->migrate_mutex);
out_unlock_svms:
	mutex_unlock(&p->svms.lock);
out_unref_process:
	pr_debug("CPU fault svms 0x%p address 0x%lx done\n", &p->svms, addr);
	kfd_unref_process(p);
out_mmput:
	mmput(mm);
	return r ? VM_FAULT_SIGBUS : 0;
}

static const struct dev_pagemap_ops svm_migrate_pgmap_ops = {
	.page_free = svm_migrate_page_free,
	.migrate_to_ram = svm_migrate_to_ram,
};

/* Each VRAM page uses sizeof(struct page) on system memory */
#define SVM_HMM_PAGE_STRUCT_SIZE(size) ((size)/PAGE_SIZE * sizeof(struct page))

int kgd2kfd_init_zone_device(struct amdgpu_device *adev)
{
	struct amdgpu_kfd_dev *kfddev = &adev->kfd;
	struct dev_pagemap *pgmap;
	struct resource *res = NULL;
	unsigned long size;
	void *r;

	/* Page migration works on gfx9 or newer */
	if (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(9, 0, 1))
		return -EINVAL;

	if (adev->gmc.is_app_apu)
		return 0;

	pgmap = &kfddev->pgmap;
	memset(pgmap, 0, sizeof(*pgmap));

	/* TODO: register all vram to HMM for now.
	 * should remove reserved size
	 */
	size = ALIGN(adev->gmc.real_vram_size, 2ULL << 20);
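
	/* Coherently connected GPUs expose vram through the CPU-visible
	 * aperture and register it as MEMORY_DEVICE_COHERENT; otherwise a
	 * free physical address range is reserved and vram is registered as
	 * MEMORY_DEVICE_PRIVATE.
	 */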
	if (adev->gmc.xgmi.connected_to_cpu) {
		pgmap->range.start = adev->gmc.aper_base;
		pgmap->range.end = adev->gmc.aper_base + adev->gmc.aper_size - 1;
		pgmap->type = MEMORY_DEVICE_COHERENT;
	} else {
		res = devm_request_free_mem_region(adev->dev, &iomem_resource, size);
		if (IS_ERR(res))
			return PTR_ERR(res);
		pgmap->range.start = res->start;
		pgmap->range.end = res->end;
		pgmap->type = MEMORY_DEVICE_PRIVATE;
	}

	pgmap->nr_range = 1;
	pgmap->ops = &svm_migrate_pgmap_ops;
	pgmap->owner = SVM_ADEV_PGMAP_OWNER(adev);
	pgmap->flags = 0;
	/* Device manager releases device-specific resources, memory region and
	 * pgmap when driver disconnects from device.
	 */
	r = devm_memremap_pages(adev->dev, pgmap);
	if (IS_ERR(r)) {
		pr_err("failed to register HMM device memory\n");
		if (pgmap->type == MEMORY_DEVICE_PRIVATE)
			devm_release_mem_region(adev->dev, res->start, resource_size(res));
		/* Disable SVM support capability */
		pgmap->type = 0;
		return PTR_ERR(r);
	}

	pr_debug("reserve %ldMB system memory for VRAM pages struct\n",
		 SVM_HMM_PAGE_STRUCT_SIZE(size) >> 20);

	amdgpu_amdkfd_reserve_system_mem(SVM_HMM_PAGE_STRUCT_SIZE(size));

	pr_info("HMM registered %ldMB device memory\n", size >> 20);

	return 0;
}