1 | /* |
2 | * Copyright 2023 Advanced Micro Devices, Inc. |
3 | * |
4 | * Permission is hereby granted, free of charge, to any person obtaining a |
5 | * copy of this software and associated documentation files (the "Software"), |
6 | * to deal in the Software without restriction, including without limitation |
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
8 | * and/or sell copies of the Software, and to permit persons to whom the |
9 | * Software is furnished to do so, subject to the following conditions: |
10 | * |
11 | * The above copyright notice and this permission notice shall be included in |
12 | * all copies or substantial portions of the Software. |
13 | * |
14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
17 | * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR |
18 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, |
19 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR |
20 | * OTHER DEALINGS IN THE SOFTWARE. |
21 | */ |
22 | |
23 | #include "kfd_debug.h" |
24 | #include "kfd_device_queue_manager.h" |
25 | #include "kfd_topology.h" |
26 | #include <linux/file.h> |
27 | #include <uapi/linux/kfd_ioctl.h> |
28 | |
29 | #define MAX_WATCH_ADDRESSES 4 |
30 | |
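/* Poll for pending debug events in queue, then device, then process
 * order; the first match is reported through the out parameters and
 * any bits covered by exception_clear_mask are cleared at the source.
 * Returns -EAGAIN when no exception is pending.
 */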
31 | int kfd_dbg_ev_query_debug_event(struct kfd_process *process, |
32 | unsigned int *queue_id, |
33 | unsigned int *gpu_id, |
34 | uint64_t exception_clear_mask, |
35 | uint64_t *event_status) |
36 | { |
37 | struct process_queue_manager *pqm; |
38 | struct process_queue_node *pqn; |
39 | int i; |
40 | |
41 | if (!(process && process->debug_trap_enabled)) |
42 | return -ENODATA; |
43 | |
44 | mutex_lock(&process->event_mutex); |
45 | *event_status = 0; |
46 | *queue_id = 0; |
47 | *gpu_id = 0; |
48 | |
49 | /* find and report queue events */ |
50 | pqm = &process->pqm; |
51 | list_for_each_entry(pqn, &pqm->queues, process_queue_list) { |
52 | uint64_t tmp = process->exception_enable_mask; |
53 | |
54 | if (!pqn->q) |
55 | continue; |
56 | |
57 | tmp &= pqn->q->properties.exception_status; |
58 | |
59 | if (!tmp) |
60 | continue; |
61 | |
62 | *event_status = pqn->q->properties.exception_status; |
63 | *queue_id = pqn->q->properties.queue_id; |
64 | *gpu_id = pqn->q->device->id; |
65 | pqn->q->properties.exception_status &= ~exception_clear_mask; |
66 | goto out; |
67 | } |
68 | |
69 | /* find and report device events */ |
70 | for (i = 0; i < process->n_pdds; i++) { |
71 | struct kfd_process_device *pdd = process->pdds[i]; |
72 | uint64_t tmp = process->exception_enable_mask |
73 | & pdd->exception_status; |
74 | |
75 | if (!tmp) |
76 | continue; |
77 | |
78 | *event_status = pdd->exception_status; |
79 | *gpu_id = pdd->dev->id; |
80 | pdd->exception_status &= ~exception_clear_mask; |
81 | goto out; |
82 | } |
83 | |
84 | /* report process events */ |
85 | if (process->exception_enable_mask & process->exception_status) { |
86 | *event_status = process->exception_status; |
87 | process->exception_status &= ~exception_clear_mask; |
88 | } |
89 | |
90 | out: |
	mutex_unlock(&process->event_mutex);
92 | return *event_status ? 0 : -EAGAIN; |
93 | } |
94 | |
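/* Deferred writer for the debugger polling descriptor: writes a single
 * byte to the process's dbg_ev_file to signal the attached debugger.
 */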
95 | void debug_event_write_work_handler(struct work_struct *work) |
96 | { |
97 | struct kfd_process *process; |
98 | |
99 | static const char write_data = '.'; |
100 | loff_t pos = 0; |
101 | |
102 | process = container_of(work, |
103 | struct kfd_process, |
104 | debug_event_workarea); |
105 | |
106 | kernel_write(process->dbg_ev_file, &write_data, 1, &pos); |
107 | } |
108 | |
/* update process/device/queue exception status; write to the event
 * descriptor only if the raised exception is subscribed in the process
 * exception_enable_mask.
 */
112 | bool kfd_dbg_ev_raise(uint64_t event_mask, |
113 | struct kfd_process *process, struct kfd_node *dev, |
114 | unsigned int source_id, bool use_worker, |
115 | void *exception_data, size_t exception_data_size) |
116 | { |
117 | struct process_queue_manager *pqm; |
118 | struct process_queue_node *pqn; |
119 | int i; |
120 | static const char write_data = '.'; |
121 | loff_t pos = 0; |
122 | bool is_subscribed = true; |
123 | |
124 | if (!(process && process->debug_trap_enabled)) |
125 | return false; |
126 | |
127 | mutex_lock(&process->event_mutex); |
128 | |
129 | if (event_mask & KFD_EC_MASK_DEVICE) { |
130 | for (i = 0; i < process->n_pdds; i++) { |
131 | struct kfd_process_device *pdd = process->pdds[i]; |
132 | |
133 | if (pdd->dev != dev) |
134 | continue; |
135 | |
136 | pdd->exception_status |= event_mask & KFD_EC_MASK_DEVICE; |
137 | |
138 | if (event_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) { |
139 | if (!pdd->vm_fault_exc_data) { |
					pdd->vm_fault_exc_data = kmemdup(
							exception_data,
							exception_data_size,
							GFP_KERNEL);
					if (!pdd->vm_fault_exc_data)
						pr_debug("Failed to allocate exception data memory");
				} else {
					pr_debug("Debugger exception data not saved\n");
					print_hex_dump_bytes("exception data: ",
149 | DUMP_PREFIX_OFFSET, |
150 | exception_data, |
151 | exception_data_size); |
152 | } |
153 | } |
154 | break; |
155 | } |
156 | } else if (event_mask & KFD_EC_MASK_PROCESS) { |
157 | process->exception_status |= event_mask & KFD_EC_MASK_PROCESS; |
158 | } else { |
159 | pqm = &process->pqm; |
160 | list_for_each_entry(pqn, &pqm->queues, |
161 | process_queue_list) { |
162 | int target_id; |
163 | |
164 | if (!pqn->q) |
165 | continue; |
166 | |
167 | target_id = event_mask & KFD_EC_MASK(EC_QUEUE_NEW) ? |
168 | pqn->q->properties.queue_id : |
169 | pqn->q->doorbell_id; |
170 | |
171 | if (pqn->q->device != dev || target_id != source_id) |
172 | continue; |
173 | |
174 | pqn->q->properties.exception_status |= event_mask; |
175 | break; |
176 | } |
177 | } |
178 | |
179 | if (process->exception_enable_mask & event_mask) { |
180 | if (use_worker) |
			schedule_work(&process->debug_event_workarea);
182 | else |
183 | kernel_write(process->dbg_ev_file, |
184 | &write_data, |
185 | 1, |
186 | &pos); |
187 | } else { |
188 | is_subscribed = false; |
189 | } |
190 | |
	mutex_unlock(&process->event_mutex);
192 | |
193 | return is_subscribed; |
194 | } |
195 | |
196 | /* set pending event queue entry from ring entry */ |
197 | bool kfd_set_dbg_ev_from_interrupt(struct kfd_node *dev, |
198 | unsigned int pasid, |
199 | uint32_t doorbell_id, |
200 | uint64_t trap_mask, |
201 | void *exception_data, |
202 | size_t exception_data_size) |
203 | { |
204 | struct kfd_process *p; |
205 | bool signaled_to_debugger_or_runtime = false; |
206 | |
207 | p = kfd_lookup_process_by_pasid(pasid); |
208 | |
209 | if (!p) |
210 | return false; |
211 | |
	if (!kfd_dbg_ev_raise(trap_mask, p, dev, doorbell_id, true,
213 | exception_data, exception_data_size)) { |
214 | struct process_queue_manager *pqm; |
215 | struct process_queue_node *pqn; |
216 | |
217 | if (!!(trap_mask & KFD_EC_MASK_QUEUE) && |
218 | p->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED) { |
219 | mutex_lock(&p->mutex); |
220 | |
221 | pqm = &p->pqm; |
222 | list_for_each_entry(pqn, &pqm->queues, |
223 | process_queue_list) { |
224 | |
225 | if (!(pqn->q && pqn->q->device == dev && |
226 | pqn->q->doorbell_id == doorbell_id)) |
227 | continue; |
228 | |
				kfd_send_exception_to_runtime(p, pqn->q->properties.queue_id,
							      trap_mask);
231 | |
232 | signaled_to_debugger_or_runtime = true; |
233 | |
234 | break; |
235 | } |
236 | |
			mutex_unlock(&p->mutex);
		} else if (trap_mask & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
			kfd_dqm_evict_pasid(dev->dqm, p->pasid);
			kfd_signal_vm_fault_event(dev, p->pasid, NULL,
						  exception_data);
242 | |
243 | signaled_to_debugger_or_runtime = true; |
244 | } |
245 | } else { |
246 | signaled_to_debugger_or_runtime = true; |
247 | } |
248 | |
249 | kfd_unref_process(p); |
250 | |
251 | return signaled_to_debugger_or_runtime; |
252 | } |
253 | |
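/* Forward debugger-raised exceptions to the runtime: memory violations
 * replay the saved VM fault data, EC_PROCESS_RUNTIME releases the
 * runtime-enable semaphore, and any remaining reason is sent to the
 * runtime event handler for the given queue.
 */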
254 | int kfd_dbg_send_exception_to_runtime(struct kfd_process *p, |
255 | unsigned int dev_id, |
256 | unsigned int queue_id, |
257 | uint64_t error_reason) |
258 | { |
259 | if (error_reason & KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) { |
260 | struct kfd_process_device *pdd = NULL; |
261 | struct kfd_hsa_memory_exception_data *data; |
262 | int i; |
263 | |
264 | for (i = 0; i < p->n_pdds; i++) { |
265 | if (p->pdds[i]->dev->id == dev_id) { |
266 | pdd = p->pdds[i]; |
267 | break; |
268 | } |
269 | } |
270 | |
271 | if (!pdd) |
272 | return -ENODEV; |
273 | |
274 | data = (struct kfd_hsa_memory_exception_data *) |
275 | pdd->vm_fault_exc_data; |
276 | |
		kfd_dqm_evict_pasid(pdd->dev->dqm, p->pasid);
		kfd_signal_vm_fault_event(pdd->dev, p->pasid, NULL, data);
279 | error_reason &= ~KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION); |
280 | } |
281 | |
282 | if (error_reason & (KFD_EC_MASK(EC_PROCESS_RUNTIME))) { |
283 | /* |
284 | * block should only happen after the debugger receives runtime |
285 | * enable notice. |
286 | */ |
		up(&p->runtime_enable_sema);
288 | error_reason &= ~KFD_EC_MASK(EC_PROCESS_RUNTIME); |
289 | } |
290 | |
291 | if (error_reason) |
292 | return kfd_send_exception_to_runtime(p, queue_id, error_reason); |
293 | |
294 | return 0; |
295 | } |
296 | |
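/* Toggle the CWSR workaround on a single queue via an MQD update.
 * No-op on devices that do not need the workaround; refuses to enable
 * it on queues that already have a user CU mask applied.
 */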
297 | static int kfd_dbg_set_queue_workaround(struct queue *q, bool enable) |
298 | { |
299 | struct mqd_update_info minfo = {0}; |
300 | int err; |
301 | |
302 | if (!q) |
303 | return 0; |
304 | |
	if (!kfd_dbg_has_cwsr_workaround(q->device))
306 | return 0; |
307 | |
308 | if (enable && q->properties.is_user_cu_masked) |
309 | return -EBUSY; |
310 | |
311 | minfo.update_flag = enable ? UPDATE_FLAG_DBG_WA_ENABLE : UPDATE_FLAG_DBG_WA_DISABLE; |
312 | |
313 | q->properties.is_dbg_wa = enable; |
314 | err = q->device->dqm->ops.update_queue(q->device->dqm, q, &minfo); |
315 | if (err) |
316 | q->properties.is_dbg_wa = false; |
317 | |
318 | return err; |
319 | } |
320 | |
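/* Apply or remove the CWSR workaround across all process queues. On a
 * failed enable, every queue is unwound and the runtime state records
 * whether the failure was a busy queue or a hard error.
 */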
321 | static int kfd_dbg_set_workaround(struct kfd_process *target, bool enable) |
322 | { |
323 | struct process_queue_manager *pqm = &target->pqm; |
324 | struct process_queue_node *pqn; |
325 | int r = 0; |
326 | |
327 | list_for_each_entry(pqn, &pqm->queues, process_queue_list) { |
		r = kfd_dbg_set_queue_workaround(pqn->q, enable);
329 | if (enable && r) |
330 | goto unwind; |
331 | } |
332 | |
333 | return 0; |
334 | |
335 | unwind: |
336 | list_for_each_entry(pqn, &pqm->queues, process_queue_list) |
		kfd_dbg_set_queue_workaround(pqn->q, false);
338 | |
339 | if (enable) |
340 | target->runtime_info.runtime_state = r == -EBUSY ? |
341 | DEBUG_RUNTIME_STATE_ENABLED_BUSY : |
342 | DEBUG_RUNTIME_STATE_ENABLED_ERROR; |
343 | |
344 | return r; |
345 | } |
346 | |
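/* Push the per-VMID debug configuration (SPI debug control, watch
 * points, debug flags) to the MES firmware scheduler; no-op on devices
 * without per-VMID debug support.
 */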
347 | int kfd_dbg_set_mes_debug_mode(struct kfd_process_device *pdd, bool sq_trap_en) |
348 | { |
349 | uint32_t spi_dbg_cntl = pdd->spi_dbg_override | pdd->spi_dbg_launch_mode; |
350 | uint32_t flags = pdd->process->dbg_flags; |
351 | |
	if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
353 | return 0; |
354 | |
	return amdgpu_mes_set_shader_debugger(pdd->dev->adev, pdd->proc_ctx_gpu_addr, spi_dbg_cntl,
						pdd->watch_points, flags, sq_trap_en);
357 | } |
358 | |
359 | #define KFD_DEBUGGER_INVALID_WATCH_POINT_ID -1 |
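/* Allocate a free device watch point ID under watch_points_lock, marking
 * it in both the per-process and per-device bitmaps; returns -ENOMEM
 * when all MAX_WATCH_ADDRESSES slots are taken.
 */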
360 | static int kfd_dbg_get_dev_watch_id(struct kfd_process_device *pdd, int *watch_id) |
361 | { |
362 | int i; |
363 | |
364 | *watch_id = KFD_DEBUGGER_INVALID_WATCH_POINT_ID; |
365 | |
	spin_lock(&pdd->dev->kfd->watch_points_lock);

	for (i = 0; i < MAX_WATCH_ADDRESSES; i++) {
		/* device watchpoint in use so skip */
		if ((pdd->dev->kfd->alloc_watch_ids >> i) & 0x1)
			continue;

		pdd->alloc_watch_ids |= 0x1 << i;
		pdd->dev->kfd->alloc_watch_ids |= 0x1 << i;
		*watch_id = i;
		spin_unlock(&pdd->dev->kfd->watch_points_lock);
		return 0;
	}

	spin_unlock(&pdd->dev->kfd->watch_points_lock);
381 | |
382 | return -ENOMEM; |
383 | } |
384 | |
385 | static void kfd_dbg_clear_dev_watch_id(struct kfd_process_device *pdd, int watch_id) |
386 | { |
	spin_lock(&pdd->dev->kfd->watch_points_lock);

	/* process owns device watch point so safe to clear */
	if ((pdd->alloc_watch_ids >> watch_id) & 0x1) {
		pdd->alloc_watch_ids &= ~(0x1 << watch_id);
		pdd->dev->kfd->alloc_watch_ids &= ~(0x1 << watch_id);
	}

	spin_unlock(&pdd->dev->kfd->watch_points_lock);
396 | } |
397 | |
398 | static bool kfd_dbg_owns_dev_watch_id(struct kfd_process_device *pdd, int watch_id) |
399 | { |
400 | bool owns_watch_id = false; |
401 | |
	spin_lock(&pdd->dev->kfd->watch_points_lock);
	owns_watch_id = watch_id < MAX_WATCH_ADDRESSES &&
			((pdd->alloc_watch_ids >> watch_id) & 0x1);

	spin_unlock(&pdd->dev->kfd->watch_points_lock);
407 | |
408 | return owns_watch_id; |
409 | } |
410 | |
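/* Clear a device address watch point owned by this process, suspending
 * queue mapping (HWS) or reprogramming MES around the register update,
 * then release the watch point ID.
 */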
411 | int kfd_dbg_trap_clear_dev_address_watch(struct kfd_process_device *pdd, |
412 | uint32_t watch_id) |
413 | { |
414 | int r; |
415 | |
416 | if (!kfd_dbg_owns_dev_watch_id(pdd, watch_id)) |
417 | return -EINVAL; |
418 | |
419 | if (!pdd->dev->kfd->shared_resources.enable_mes) { |
		r = debug_lock_and_unmap(pdd->dev->dqm);
421 | if (r) |
422 | return r; |
423 | } |
424 | |
	amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
	pdd->watch_points[watch_id] = pdd->dev->kfd2kgd->clear_address_watch(
								pdd->dev->adev,
								watch_id);
	amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

	if (!pdd->dev->kfd->shared_resources.enable_mes)
		r = debug_map_and_unlock(pdd->dev->dqm);
	else
		r = kfd_dbg_set_mes_debug_mode(pdd, true);
435 | |
436 | kfd_dbg_clear_dev_watch_id(pdd, watch_id); |
437 | |
438 | return r; |
439 | } |
440 | |
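/* Program an address watch point on every XCC instance of the device.
 * The watch ID is allocated here and released again on failure paths.
 */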
441 | int kfd_dbg_trap_set_dev_address_watch(struct kfd_process_device *pdd, |
442 | uint64_t watch_address, |
443 | uint32_t watch_address_mask, |
444 | uint32_t *watch_id, |
445 | uint32_t watch_mode) |
446 | { |
447 | int xcc_id, r = kfd_dbg_get_dev_watch_id(pdd, watch_id); |
448 | uint32_t xcc_mask = pdd->dev->xcc_mask; |
449 | |
450 | if (r) |
451 | return r; |
452 | |
453 | if (!pdd->dev->kfd->shared_resources.enable_mes) { |
		r = debug_lock_and_unmap(pdd->dev->dqm);
		if (r) {
			kfd_dbg_clear_dev_watch_id(pdd, *watch_id);
457 | return r; |
458 | } |
459 | } |
460 | |
	amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
462 | for_each_inst(xcc_id, xcc_mask) |
463 | pdd->watch_points[*watch_id] = pdd->dev->kfd2kgd->set_address_watch( |
464 | pdd->dev->adev, |
465 | watch_address, |
466 | watch_address_mask, |
467 | *watch_id, |
468 | watch_mode, |
469 | pdd->dev->vm_info.last_vmid_kfd, |
470 | xcc_id); |
	amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

	if (!pdd->dev->kfd->shared_resources.enable_mes)
		r = debug_map_and_unlock(pdd->dev->dqm);
	else
		r = kfd_dbg_set_mes_debug_mode(pdd, true);

	/* HWS is broken so no point in HW rollback but release the watchpoint anyways */
	if (r)
		kfd_dbg_clear_dev_watch_id(pdd, *watch_id);
481 | |
482 | return 0; |
483 | } |
484 | |
485 | static void kfd_dbg_clear_process_address_watch(struct kfd_process *target) |
486 | { |
487 | int i, j; |
488 | |
489 | for (i = 0; i < target->n_pdds; i++) |
490 | for (j = 0; j < MAX_WATCH_ADDRESSES; j++) |
			kfd_dbg_trap_clear_dev_address_watch(target->pdds[i], j);
492 | } |
493 | |
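/* Update the process debug flags (currently only the single-memory-op
 * flag) and propagate them to each per-VMID capable device; on failure
 * the previous flags are restored and already-updated devices rewound.
 */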
494 | int kfd_dbg_trap_set_flags(struct kfd_process *target, uint32_t *flags) |
495 | { |
496 | uint32_t prev_flags = target->dbg_flags; |
497 | int i, r = 0, rewind_count = 0; |
498 | |
499 | for (i = 0; i < target->n_pdds; i++) { |
		if (!kfd_dbg_is_per_vmid_supported(target->pdds[i]->dev) &&
501 | (*flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP)) { |
502 | *flags = prev_flags; |
503 | return -EACCES; |
504 | } |
505 | } |
506 | |
507 | target->dbg_flags = *flags & KFD_DBG_TRAP_FLAG_SINGLE_MEM_OP; |
508 | *flags = prev_flags; |
509 | for (i = 0; i < target->n_pdds; i++) { |
510 | struct kfd_process_device *pdd = target->pdds[i]; |
511 | |
		if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
			continue;

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd, true);
519 | |
520 | if (r) { |
521 | target->dbg_flags = prev_flags; |
522 | break; |
523 | } |
524 | |
525 | rewind_count++; |
526 | } |
527 | |
528 | /* Rewind flags */ |
529 | if (r) { |
530 | target->dbg_flags = prev_flags; |
531 | |
532 | for (i = 0; i < rewind_count; i++) { |
533 | struct kfd_process_device *pdd = target->pdds[i]; |
534 | |
			if (!kfd_dbg_is_per_vmid_supported(pdd->dev))
				continue;

			if (!pdd->dev->kfd->shared_resources.enable_mes)
				debug_refresh_runlist(pdd->dev->dqm);
			else
				kfd_dbg_set_mes_debug_mode(pdd, true);
542 | } |
543 | } |
544 | |
545 | return r; |
546 | } |
547 | |
548 | /* kfd_dbg_trap_deactivate: |
549 | * target: target process |
550 | * unwind: If this is unwinding a failed kfd_dbg_trap_enable() |
551 | * unwind_count: |
552 | * If unwind == true, how far down the pdd list we need |
553 | * to unwind |
554 | * else: ignored |
555 | */ |
556 | void kfd_dbg_trap_deactivate(struct kfd_process *target, bool unwind, int unwind_count) |
557 | { |
558 | int i; |
559 | |
560 | if (!unwind) { |
561 | uint32_t flags = 0; |
		int resume_count = resume_queues(target, 0, NULL);

		if (resume_count)
			pr_debug("Resumed %d queues\n", resume_count);

		cancel_work_sync(&target->debug_event_workarea);
		kfd_dbg_clear_process_address_watch(target);
		kfd_dbg_trap_set_wave_launch_mode(target, 0);

		kfd_dbg_trap_set_flags(target, &flags);
572 | } |
573 | |
574 | for (i = 0; i < target->n_pdds; i++) { |
575 | struct kfd_process_device *pdd = target->pdds[i]; |
576 | |
577 | /* If this is an unwind, and we have unwound the required |
578 | * enable calls on the pdd list, we need to stop now |
579 | * otherwise we may mess up another debugger session. |
580 | */ |
581 | if (unwind && i == unwind_count) |
582 | break; |
583 | |
		kfd_process_set_trap_debug_flag(&pdd->qpd, false);

		/* GFX off is already disabled by debug activate if not RLC restore supported. */
		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
			amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_override =
				pdd->dev->kfd2kgd->disable_debug_trap(
				pdd->dev->adev,
				target->runtime_info.ttmp_setup,
				pdd->dev->vm_info.last_vmid_kfd);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!kfd_dbg_is_per_vmid_supported(pdd->dev) &&
		    release_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd))
			pr_err("Failed to release debug vmid on [%i]\n", pdd->dev->id);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			debug_refresh_runlist(pdd->dev->dqm);
		else
			kfd_dbg_set_mes_debug_mode(pdd, !kfd_dbg_has_cwsr_workaround(pdd->dev));
	}

	kfd_dbg_set_workaround(target, false);
607 | } |
608 | |
609 | static void kfd_dbg_clean_exception_status(struct kfd_process *target) |
610 | { |
611 | struct process_queue_manager *pqm; |
612 | struct process_queue_node *pqn; |
613 | int i; |
614 | |
615 | for (i = 0; i < target->n_pdds; i++) { |
616 | struct kfd_process_device *pdd = target->pdds[i]; |
617 | |
618 | kfd_process_drain_interrupts(pdd); |
619 | |
620 | pdd->exception_status = 0; |
621 | } |
622 | |
623 | pqm = &target->pqm; |
624 | list_for_each_entry(pqn, &pqm->queues, process_queue_list) { |
625 | if (!pqn->q) |
626 | continue; |
627 | |
628 | pqn->q->properties.exception_status = 0; |
629 | } |
630 | |
631 | target->exception_status = 0; |
632 | } |
633 | |
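/* Detach the debugger from the target: deactivate now if the runtime is
 * enabled (otherwise reset the runtime state for re-attach), then drop
 * the event file, debugger bookkeeping and the debug session reference.
 */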
634 | int kfd_dbg_trap_disable(struct kfd_process *target) |
635 | { |
636 | if (!target->debug_trap_enabled) |
637 | return 0; |
638 | |
639 | /* |
640 | * Defer deactivation to runtime if runtime not enabled otherwise reset |
641 | * attached running target runtime state to enable for re-attach. |
642 | */ |
643 | if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED) |
		kfd_dbg_trap_deactivate(target, false, 0);
645 | else if (target->runtime_info.runtime_state != DEBUG_RUNTIME_STATE_DISABLED) |
646 | target->runtime_info.runtime_state = DEBUG_RUNTIME_STATE_ENABLED; |
647 | |
648 | fput(target->dbg_ev_file); |
649 | target->dbg_ev_file = NULL; |
650 | |
651 | if (target->debugger_process) { |
		atomic_dec(&target->debugger_process->debugged_process_count);
653 | target->debugger_process = NULL; |
654 | } |
655 | |
656 | target->debug_trap_enabled = false; |
657 | kfd_dbg_clean_exception_status(target); |
	kfd_unref_process(target);
659 | |
660 | return 0; |
661 | } |
662 | |
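/* Enable debug trapping on every device of the target process; enabling
 * is all-or-nothing, so a failure on any GPU unwinds the ones already
 * activated.
 */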
663 | int kfd_dbg_trap_activate(struct kfd_process *target) |
664 | { |
665 | int i, r = 0; |
666 | |
	r = kfd_dbg_set_workaround(target, true);
668 | if (r) |
669 | return r; |
670 | |
671 | for (i = 0; i < target->n_pdds; i++) { |
672 | struct kfd_process_device *pdd = target->pdds[i]; |
673 | |
		if (!kfd_dbg_is_per_vmid_supported(pdd->dev)) {
			r = reserve_debug_trap_vmid(pdd->dev->dqm, &pdd->qpd);
676 | |
677 | if (r) { |
678 | target->runtime_info.runtime_state = (r == -EBUSY) ? |
679 | DEBUG_RUNTIME_STATE_ENABLED_BUSY : |
680 | DEBUG_RUNTIME_STATE_ENABLED_ERROR; |
681 | |
682 | goto unwind_err; |
683 | } |
684 | } |
685 | |
686 | /* Disable GFX OFF to prevent garbage read/writes to debug registers. |
687 | * If RLC restore of debug registers is not supported and runtime enable |
688 | * hasn't done so already on ttmp setup request, restore the trap config registers. |
689 | * |
690 | * If RLC restore of debug registers is not supported, keep gfx off disabled for |
691 | * the debug session. |
692 | */ |
		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		if (!(kfd_dbg_is_rlc_restore_supported(pdd->dev) ||
						target->runtime_info.ttmp_setup))
			pdd->dev->kfd2kgd->enable_debug_trap(pdd->dev->adev, true,
							pdd->dev->vm_info.last_vmid_kfd);

		pdd->spi_dbg_override = pdd->dev->kfd2kgd->enable_debug_trap(
					pdd->dev->adev,
					false,
					pdd->dev->vm_info.last_vmid_kfd);

		if (kfd_dbg_is_rlc_restore_supported(pdd->dev))
			amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		/*
		 * Setting the debug flag in the trap handler requires that the TMA has been
		 * allocated, which occurs during CWSR initialization.
		 * In the event that CWSR has not been initialized at this point, setting the
		 * flag will be called again during CWSR initialization if the target process
		 * is still debug enabled.
		 */
		kfd_process_set_trap_debug_flag(&pdd->qpd, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd, true);
720 | |
721 | if (r) { |
722 | target->runtime_info.runtime_state = |
723 | DEBUG_RUNTIME_STATE_ENABLED_ERROR; |
724 | goto unwind_err; |
725 | } |
726 | } |
727 | |
728 | return 0; |
729 | |
730 | unwind_err: |
731 | /* Enabling debug failed, we need to disable on |
732 | * all GPUs so the enable is all or nothing. |
733 | */ |
	kfd_dbg_trap_deactivate(target, true, i);
735 | return r; |
736 | } |
737 | |
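/* Attach a debugger to the target process. The polling descriptor fd is
 * pinned for the session, activation is deferred until the runtime
 * reports itself enabled, and the runtime info is copied back to the
 * caller.
 */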
738 | int kfd_dbg_trap_enable(struct kfd_process *target, uint32_t fd, |
739 | void __user *runtime_info, uint32_t *runtime_size) |
740 | { |
741 | struct file *f; |
742 | uint32_t copy_size; |
743 | int i, r = 0; |
744 | |
745 | if (target->debug_trap_enabled) |
746 | return -EALREADY; |
747 | |
748 | /* Enable pre-checks */ |
749 | for (i = 0; i < target->n_pdds; i++) { |
750 | struct kfd_process_device *pdd = target->pdds[i]; |
751 | |
752 | if (!KFD_IS_SOC15(pdd->dev)) |
753 | return -ENODEV; |
754 | |
		if (pdd->qpd.num_gws && (!kfd_dbg_has_gws_support(pdd->dev) ||
					 kfd_dbg_has_cwsr_workaround(pdd->dev)))
757 | return -EBUSY; |
758 | } |
759 | |
760 | copy_size = min((size_t)(*runtime_size), sizeof(target->runtime_info)); |
761 | |
762 | f = fget(fd); |
763 | if (!f) { |
764 | pr_err("Failed to get file for (%i)\n" , fd); |
765 | return -EBADF; |
766 | } |
767 | |
768 | target->dbg_ev_file = f; |
769 | |
770 | /* defer activation to runtime if not runtime enabled */ |
771 | if (target->runtime_info.runtime_state == DEBUG_RUNTIME_STATE_ENABLED) |
772 | kfd_dbg_trap_activate(target); |
773 | |
774 | /* We already hold the process reference but hold another one for the |
775 | * debug session. |
776 | */ |
	kref_get(&target->ref);
778 | target->debug_trap_enabled = true; |
779 | |
780 | if (target->debugger_process) |
		atomic_inc(&target->debugger_process->debugged_process_count);
782 | |
	if (copy_to_user(runtime_info, (void *)&target->runtime_info, copy_size)) {
		kfd_dbg_trap_deactivate(target, false, 0);
785 | r = -EFAULT; |
786 | } |
787 | |
788 | *runtime_size = sizeof(target->runtime_info); |
789 | |
790 | return r; |
791 | } |
792 | |
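/* Ask every device to validate the requested trap override, narrowing
 * the supported trap mask across devices; returns -EACCES if the request
 * exceeds what all devices support.
 */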
793 | static int kfd_dbg_validate_trap_override_request(struct kfd_process *p, |
794 | uint32_t trap_override, |
795 | uint32_t trap_mask_request, |
796 | uint32_t *trap_mask_supported) |
797 | { |
798 | int i = 0; |
799 | |
800 | *trap_mask_supported = 0xffffffff; |
801 | |
802 | for (i = 0; i < p->n_pdds; i++) { |
803 | struct kfd_process_device *pdd = p->pdds[i]; |
804 | int err = pdd->dev->kfd2kgd->validate_trap_override_request( |
805 | pdd->dev->adev, |
806 | trap_override, |
807 | trap_mask_supported); |
808 | |
809 | if (err) |
810 | return err; |
811 | } |
812 | |
813 | if (trap_mask_request & ~*trap_mask_supported) |
814 | return -EACCES; |
815 | |
816 | return 0; |
817 | } |
818 | |
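/* Apply a wave launch trap override on all devices after validating it,
 * then refresh the runlist (HWS) or MES debug state so the new SPI debug
 * settings take effect.
 */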
819 | int kfd_dbg_trap_set_wave_launch_override(struct kfd_process *target, |
820 | uint32_t trap_override, |
821 | uint32_t trap_mask_bits, |
822 | uint32_t trap_mask_request, |
823 | uint32_t *trap_mask_prev, |
824 | uint32_t *trap_mask_supported) |
825 | { |
826 | int r = 0, i; |
827 | |
	r = kfd_dbg_validate_trap_override_request(target,
829 | trap_override, |
830 | trap_mask_request, |
831 | trap_mask_supported); |
832 | |
833 | if (r) |
834 | return r; |
835 | |
836 | for (i = 0; i < target->n_pdds; i++) { |
837 | struct kfd_process_device *pdd = target->pdds[i]; |
838 | |
		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_override = pdd->dev->kfd2kgd->set_wave_launch_trap_override(
				pdd->dev->adev,
				pdd->dev->vm_info.last_vmid_kfd,
				trap_override,
				trap_mask_bits,
				trap_mask_request,
				trap_mask_prev,
				pdd->spi_dbg_override);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd, true);
854 | |
855 | if (r) |
856 | break; |
857 | } |
858 | |
859 | return r; |
860 | } |
861 | |
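/* Set the wave launch mode (normal, halt or debug) on every device of
 * the target and refresh the scheduler state accordingly.
 */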
862 | int kfd_dbg_trap_set_wave_launch_mode(struct kfd_process *target, |
863 | uint8_t wave_launch_mode) |
864 | { |
865 | int r = 0, i; |
866 | |
867 | if (wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_NORMAL && |
868 | wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_HALT && |
869 | wave_launch_mode != KFD_DBG_TRAP_WAVE_LAUNCH_MODE_DEBUG) |
870 | return -EINVAL; |
871 | |
872 | for (i = 0; i < target->n_pdds; i++) { |
873 | struct kfd_process_device *pdd = target->pdds[i]; |
874 | |
		amdgpu_gfx_off_ctrl(pdd->dev->adev, false);
		pdd->spi_dbg_launch_mode = pdd->dev->kfd2kgd->set_wave_launch_mode(
				pdd->dev->adev,
				wave_launch_mode,
				pdd->dev->vm_info.last_vmid_kfd);
		amdgpu_gfx_off_ctrl(pdd->dev->adev, true);

		if (!pdd->dev->kfd->shared_resources.enable_mes)
			r = debug_refresh_runlist(pdd->dev->dqm);
		else
			r = kfd_dbg_set_mes_debug_mode(pdd, true);
886 | |
887 | if (r) |
888 | break; |
889 | } |
890 | |
891 | return r; |
892 | } |
893 | |
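/* Copy exception details for a queue, device or process level exception
 * code to user space, optionally clearing the exception; source_id is a
 * queue ID or GPU ID depending on the exception type.
 */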
894 | int kfd_dbg_trap_query_exception_info(struct kfd_process *target, |
895 | uint32_t source_id, |
896 | uint32_t exception_code, |
897 | bool clear_exception, |
898 | void __user *info, |
899 | uint32_t *info_size) |
900 | { |
901 | bool found = false; |
902 | int r = 0; |
903 | uint32_t copy_size, actual_info_size = 0; |
904 | uint64_t *exception_status_ptr = NULL; |
905 | |
906 | if (!target) |
907 | return -EINVAL; |
908 | |
909 | if (!info || !info_size) |
910 | return -EINVAL; |
911 | |
912 | mutex_lock(&target->event_mutex); |
913 | |
914 | if (KFD_DBG_EC_TYPE_IS_QUEUE(exception_code)) { |
915 | /* Per queue exceptions */ |
916 | struct queue *queue = NULL; |
917 | int i; |
918 | |
919 | for (i = 0; i < target->n_pdds; i++) { |
920 | struct kfd_process_device *pdd = target->pdds[i]; |
921 | struct qcm_process_device *qpd = &pdd->qpd; |
922 | |
923 | list_for_each_entry(queue, &qpd->queues_list, list) { |
924 | if (!found && queue->properties.queue_id == source_id) { |
925 | found = true; |
926 | break; |
927 | } |
928 | } |
929 | if (found) |
930 | break; |
931 | } |
932 | |
933 | if (!found) { |
934 | r = -EINVAL; |
935 | goto out; |
936 | } |
937 | |
938 | if (!(queue->properties.exception_status & KFD_EC_MASK(exception_code))) { |
939 | r = -ENODATA; |
940 | goto out; |
941 | } |
942 | exception_status_ptr = &queue->properties.exception_status; |
943 | } else if (KFD_DBG_EC_TYPE_IS_DEVICE(exception_code)) { |
944 | /* Per device exceptions */ |
945 | struct kfd_process_device *pdd = NULL; |
946 | int i; |
947 | |
948 | for (i = 0; i < target->n_pdds; i++) { |
949 | pdd = target->pdds[i]; |
950 | if (pdd->dev->id == source_id) { |
951 | found = true; |
952 | break; |
953 | } |
954 | } |
955 | |
956 | if (!found) { |
957 | r = -EINVAL; |
958 | goto out; |
959 | } |
960 | |
961 | if (!(pdd->exception_status & KFD_EC_MASK(exception_code))) { |
962 | r = -ENODATA; |
963 | goto out; |
964 | } |
965 | |
966 | if (exception_code == EC_DEVICE_MEMORY_VIOLATION) { |
967 | copy_size = min((size_t)(*info_size), pdd->vm_fault_exc_data_size); |
968 | |
			if (copy_to_user(info, pdd->vm_fault_exc_data, copy_size)) {
970 | r = -EFAULT; |
971 | goto out; |
972 | } |
973 | actual_info_size = pdd->vm_fault_exc_data_size; |
974 | if (clear_exception) { |
				kfree(pdd->vm_fault_exc_data);
976 | pdd->vm_fault_exc_data = NULL; |
977 | pdd->vm_fault_exc_data_size = 0; |
978 | } |
979 | } |
980 | exception_status_ptr = &pdd->exception_status; |
981 | } else if (KFD_DBG_EC_TYPE_IS_PROCESS(exception_code)) { |
982 | /* Per process exceptions */ |
983 | if (!(target->exception_status & KFD_EC_MASK(exception_code))) { |
984 | r = -ENODATA; |
985 | goto out; |
986 | } |
987 | |
988 | if (exception_code == EC_PROCESS_RUNTIME) { |
989 | copy_size = min((size_t)(*info_size), sizeof(target->runtime_info)); |
990 | |
			if (copy_to_user(info, (void *)&target->runtime_info, copy_size)) {
992 | r = -EFAULT; |
993 | goto out; |
994 | } |
995 | |
996 | actual_info_size = sizeof(target->runtime_info); |
997 | } |
998 | |
999 | exception_status_ptr = &target->exception_status; |
1000 | } else { |
1001 | pr_debug("Bad exception type [%i]\n" , exception_code); |
1002 | r = -EINVAL; |
1003 | goto out; |
1004 | } |
1005 | |
1006 | *info_size = actual_info_size; |
1007 | if (clear_exception) |
1008 | *exception_status_ptr &= ~KFD_EC_MASK(exception_code); |
1009 | out: |
	mutex_unlock(&target->event_mutex);
1011 | return r; |
1012 | } |
1013 | |
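/* Fill a per-device snapshot entry (apertures, PCI IDs, firmware and
 * topology properties) for each GPU of the target, clamped to the
 * caller-supplied entry size and device count.
 */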
1014 | int kfd_dbg_trap_device_snapshot(struct kfd_process *target, |
1015 | uint64_t exception_clear_mask, |
1016 | void __user *user_info, |
1017 | uint32_t *number_of_device_infos, |
1018 | uint32_t *entry_size) |
1019 | { |
1020 | struct kfd_dbg_device_info_entry device_info; |
1021 | uint32_t tmp_entry_size, tmp_num_devices; |
1022 | int i, r = 0; |
1023 | |
1024 | if (!(target && user_info && number_of_device_infos && entry_size)) |
1025 | return -EINVAL; |
1026 | |
1027 | tmp_entry_size = *entry_size; |
1028 | |
1029 | tmp_num_devices = min_t(size_t, *number_of_device_infos, target->n_pdds); |
1030 | *number_of_device_infos = target->n_pdds; |
1031 | *entry_size = min_t(size_t, *entry_size, sizeof(device_info)); |
1032 | |
1033 | if (!tmp_num_devices) |
1034 | return 0; |
1035 | |
1036 | memset(&device_info, 0, sizeof(device_info)); |
1037 | |
1038 | mutex_lock(&target->event_mutex); |
1039 | |
1040 | /* Run over all pdd of the process */ |
1041 | for (i = 0; i < tmp_num_devices; i++) { |
1042 | struct kfd_process_device *pdd = target->pdds[i]; |
		struct kfd_topology_device *topo_dev = kfd_topology_device_by_id(pdd->dev->id);
1044 | |
1045 | device_info.gpu_id = pdd->dev->id; |
1046 | device_info.exception_status = pdd->exception_status; |
1047 | device_info.lds_base = pdd->lds_base; |
1048 | device_info.lds_limit = pdd->lds_limit; |
1049 | device_info.scratch_base = pdd->scratch_base; |
1050 | device_info.scratch_limit = pdd->scratch_limit; |
1051 | device_info.gpuvm_base = pdd->gpuvm_base; |
1052 | device_info.gpuvm_limit = pdd->gpuvm_limit; |
1053 | device_info.location_id = topo_dev->node_props.location_id; |
1054 | device_info.vendor_id = topo_dev->node_props.vendor_id; |
1055 | device_info.device_id = topo_dev->node_props.device_id; |
1056 | device_info.revision_id = pdd->dev->adev->pdev->revision; |
1057 | device_info.subsystem_vendor_id = pdd->dev->adev->pdev->subsystem_vendor; |
1058 | device_info.subsystem_device_id = pdd->dev->adev->pdev->subsystem_device; |
1059 | device_info.fw_version = pdd->dev->kfd->mec_fw_version; |
1060 | device_info.gfx_target_version = |
1061 | topo_dev->node_props.gfx_target_version; |
1062 | device_info.simd_count = topo_dev->node_props.simd_count; |
1063 | device_info.max_waves_per_simd = |
1064 | topo_dev->node_props.max_waves_per_simd; |
1065 | device_info.array_count = topo_dev->node_props.array_count; |
1066 | device_info.simd_arrays_per_engine = |
1067 | topo_dev->node_props.simd_arrays_per_engine; |
1068 | device_info.num_xcc = NUM_XCC(pdd->dev->xcc_mask); |
1069 | device_info.capability = topo_dev->node_props.capability; |
1070 | device_info.debug_prop = topo_dev->node_props.debug_prop; |
1071 | |
1072 | if (exception_clear_mask) |
1073 | pdd->exception_status &= ~exception_clear_mask; |
1074 | |
		if (copy_to_user(user_info, &device_info, *entry_size)) {
1076 | r = -EFAULT; |
1077 | break; |
1078 | } |
1079 | |
1080 | user_info += tmp_entry_size; |
1081 | } |
1082 | |
	mutex_unlock(&target->event_mutex);
1084 | |
1085 | return r; |
1086 | } |
1087 | |
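/* Update the subscribed exception mask and kick the polling descriptor
 * once if any already-raised exception matches the new mask, so the
 * debugger does not miss events raised before it subscribed.
 */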
1088 | void kfd_dbg_set_enabled_debug_exception_mask(struct kfd_process *target, |
1089 | uint64_t exception_set_mask) |
1090 | { |
1091 | uint64_t found_mask = 0; |
1092 | struct process_queue_manager *pqm; |
1093 | struct process_queue_node *pqn; |
1094 | static const char write_data = '.'; |
1095 | loff_t pos = 0; |
1096 | int i; |
1097 | |
1098 | mutex_lock(&target->event_mutex); |
1099 | |
1100 | found_mask |= target->exception_status; |
1101 | |
1102 | pqm = &target->pqm; |
1103 | list_for_each_entry(pqn, &pqm->queues, process_queue_list) { |
1104 | if (!pqn->q) |
1105 | continue; |
1106 | |
1107 | found_mask |= pqn->q->properties.exception_status; |
1108 | } |
1109 | |
1110 | for (i = 0; i < target->n_pdds; i++) { |
1111 | struct kfd_process_device *pdd = target->pdds[i]; |
1112 | |
1113 | found_mask |= pdd->exception_status; |
1114 | } |
1115 | |
1116 | if (exception_set_mask & found_mask) |
1117 | kernel_write(target->dbg_ev_file, &write_data, 1, &pos); |
1118 | |
1119 | target->exception_enable_mask = exception_set_mask; |
1120 | |
	mutex_unlock(&target->event_mutex);
1122 | } |
1123 | |