1 | /* |
2 | * Copyright (c) 2008 Intel Corporation |
3 | * |
4 | * Permission is hereby granted, free of charge, to any person obtaining a |
5 | * copy of this software and associated documentation files (the "Software"), |
6 | * to deal in the Software without restriction, including without limitation |
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
8 | * and/or sell copies of the Software, and to permit persons to whom the |
9 | * Software is furnished to do so, subject to the following conditions: |
10 | * |
11 | * The above copyright notice and this permission notice (including the next |
12 | * paragraph) shall be included in all copies or substantial portions of the |
13 | * Software. |
14 | * |
15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
18 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
20 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
21 | * IN THE SOFTWARE. |
22 | * |
23 | * Authors: |
24 | * Eric Anholt <eric@anholt.net> |
25 | * Keith Packard <keithp@keithp.com> |
26 | * Mika Kuoppala <mika.kuoppala@intel.com> |
27 | * |
28 | */ |
29 | |
30 | #include <linux/ascii85.h> |
31 | #include <linux/highmem.h> |
32 | #include <linux/nmi.h> |
33 | #include <linux/pagevec.h> |
34 | #include <linux/scatterlist.h> |
35 | #include <linux/string_helpers.h> |
36 | #include <linux/utsname.h> |
37 | #include <linux/zlib.h> |
38 | |
39 | #include <drm/drm_cache.h> |
40 | #include <drm/drm_print.h> |
41 | |
42 | #include "display/intel_dmc.h" |
43 | #include "display/intel_overlay.h" |
44 | |
45 | #include "gem/i915_gem_context.h" |
46 | #include "gem/i915_gem_lmem.h" |
47 | #include "gt/intel_engine_regs.h" |
48 | #include "gt/intel_gt.h" |
49 | #include "gt/intel_gt_mcr.h" |
50 | #include "gt/intel_gt_pm.h" |
51 | #include "gt/intel_gt_regs.h" |
52 | #include "gt/uc/intel_guc_capture.h" |
53 | |
54 | #include "i915_driver.h" |
55 | #include "i915_drv.h" |
56 | #include "i915_gpu_error.h" |
57 | #include "i915_memcpy.h" |
58 | #include "i915_reg.h" |
59 | #include "i915_scatterlist.h" |
60 | #include "i915_sysfs.h" |
61 | #include "i915_utils.h" |
62 | |
63 | #define ALLOW_FAIL (__GFP_KSWAPD_RECLAIM | __GFP_RETRY_MAYFAIL | __GFP_NOWARN) |
64 | #define ATOMIC_MAYFAIL (GFP_ATOMIC | __GFP_NOWARN) |
65 | |
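/*
 * The error state is built up in a chain of text buffers tracked by a
 * scatterlist. The sg dma_address field is unused here, so it is
 * repurposed to hold the running byte offset of each buffer within the
 * overall dump, which lets readers seek without replaying the whole list.
 */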
66 | static void __sg_set_buf(struct scatterlist *sg, |
67 | void *addr, unsigned int len, loff_t it) |
68 | { |
69 | sg->page_link = (unsigned long)virt_to_page(addr); |
70 | sg->offset = offset_in_page(addr); |
71 | sg->length = len; |
72 | sg->dma_address = it; |
73 | } |
74 | |
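/*
 * Ensure at least @len + 1 bytes of spare capacity in the current text
 * buffer. A full buffer is sealed into the scatterlist and a new one is
 * allocated: first optimistically in 64K chunks with ALLOW_FAIL, then, as
 * a last resort, a page-aligned GFP_KERNEL allocation.
 */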
75 | static bool __i915_error_grow(struct drm_i915_error_state_buf *e, size_t len) |
76 | { |
77 | if (!len) |
78 | return false; |
79 | |
80 | if (e->bytes + len + 1 <= e->size) |
81 | return true; |
82 | |
83 | if (e->bytes) { |
		__sg_set_buf(e->cur++, e->buf, e->bytes, e->iter);
85 | e->iter += e->bytes; |
86 | e->buf = NULL; |
87 | e->bytes = 0; |
88 | } |
89 | |
90 | if (e->cur == e->end) { |
91 | struct scatterlist *sgl; |
92 | |
93 | sgl = (typeof(sgl))__get_free_page(ALLOW_FAIL); |
94 | if (!sgl) { |
95 | e->err = -ENOMEM; |
96 | return false; |
97 | } |
98 | |
99 | if (e->cur) { |
100 | e->cur->offset = 0; |
101 | e->cur->length = 0; |
102 | e->cur->page_link = |
103 | (unsigned long)sgl | SG_CHAIN; |
104 | } else { |
105 | e->sgl = sgl; |
106 | } |
107 | |
108 | e->cur = sgl; |
109 | e->end = sgl + SG_MAX_SINGLE_ALLOC - 1; |
110 | } |
111 | |
112 | e->size = ALIGN(len + 1, SZ_64K); |
	e->buf = kmalloc(e->size, ALLOW_FAIL);
	if (!e->buf) {
		e->size = PAGE_ALIGN(len + 1);
		e->buf = kmalloc(e->size, GFP_KERNEL);
117 | } |
118 | if (!e->buf) { |
119 | e->err = -ENOMEM; |
120 | return false; |
121 | } |
122 | |
123 | return true; |
124 | } |
125 | |
126 | __printf(2, 0) |
127 | static void i915_error_vprintf(struct drm_i915_error_state_buf *e, |
128 | const char *fmt, va_list args) |
129 | { |
130 | va_list ap; |
131 | int len; |
132 | |
133 | if (e->err) |
134 | return; |
135 | |
136 | va_copy(ap, args); |
	len = vsnprintf(NULL, 0, fmt, ap);
138 | va_end(ap); |
139 | if (len <= 0) { |
140 | e->err = len; |
141 | return; |
142 | } |
143 | |
144 | if (!__i915_error_grow(e, len)) |
145 | return; |
146 | |
147 | GEM_BUG_ON(e->bytes >= e->size); |
	len = vscnprintf(e->buf + e->bytes, e->size - e->bytes, fmt, args);
149 | if (len < 0) { |
150 | e->err = len; |
151 | return; |
152 | } |
153 | e->bytes += len; |
154 | } |
155 | |
156 | static void i915_error_puts(struct drm_i915_error_state_buf *e, const char *str) |
157 | { |
158 | unsigned len; |
159 | |
160 | if (e->err || !str) |
161 | return; |
162 | |
163 | len = strlen(str); |
164 | if (!__i915_error_grow(e, len)) |
165 | return; |
166 | |
167 | GEM_BUG_ON(e->bytes + len > e->size); |
168 | memcpy(e->buf + e->bytes, str, len); |
169 | e->bytes += len; |
170 | } |
171 | |
172 | #define err_printf(e, ...) i915_error_printf(e, __VA_ARGS__) |
173 | #define err_puts(e, s) i915_error_puts(e, s) |
174 | |
175 | static void __i915_printfn_error(struct drm_printer *p, struct va_format *vaf) |
176 | { |
	i915_error_vprintf(p->arg, vaf->fmt, *vaf->va);
178 | } |
179 | |
180 | static inline struct drm_printer |
181 | i915_error_printer(struct drm_i915_error_state_buf *e) |
182 | { |
183 | struct drm_printer p = { |
184 | .printfn = __i915_printfn_error, |
185 | .arg = e, |
186 | }; |
187 | return p; |
188 | } |
189 | |
190 | /* single threaded page allocator with a reserved stash for emergencies */ |
191 | static void pool_fini(struct folio_batch *fbatch) |
192 | { |
193 | folio_batch_release(fbatch); |
194 | } |
195 | |
196 | static int pool_refill(struct folio_batch *fbatch, gfp_t gfp) |
197 | { |
198 | while (folio_batch_space(fbatch)) { |
199 | struct folio *folio; |
200 | |
		folio = folio_alloc(gfp, 0);
202 | if (!folio) |
203 | return -ENOMEM; |
204 | |
205 | folio_batch_add(fbatch, folio); |
206 | } |
207 | |
208 | return 0; |
209 | } |
210 | |
211 | static int pool_init(struct folio_batch *fbatch, gfp_t gfp) |
212 | { |
213 | int err; |
214 | |
215 | folio_batch_init(fbatch); |
216 | |
217 | err = pool_refill(fbatch, gfp); |
218 | if (err) |
219 | pool_fini(fbatch); |
220 | |
221 | return err; |
222 | } |
223 | |
224 | static void *pool_alloc(struct folio_batch *fbatch, gfp_t gfp) |
225 | { |
226 | struct folio *folio; |
227 | |
	folio = folio_alloc(gfp, 0);
229 | if (!folio && folio_batch_count(fbatch)) |
230 | folio = fbatch->folios[--fbatch->nr]; |
231 | |
232 | return folio ? folio_address(folio) : NULL; |
233 | } |
234 | |
235 | static void pool_free(struct folio_batch *fbatch, void *addr) |
236 | { |
	struct folio *folio = virt_to_folio(addr);
238 | |
239 | if (folio_batch_space(fbatch)) |
240 | folio_batch_add(fbatch, folio); |
241 | else |
242 | folio_put(folio); |
243 | } |
244 | |
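/*
 * Two interchangeable backends for capturing vma contents follow. With
 * CONFIG_DRM_I915_COMPRESS_ERROR enabled, pages are deflated with zlib
 * and the dump is prefixed with ':'; otherwise pages are copied verbatim
 * and prefixed with '~'. The marker tells the decoder which encoding the
 * ascii85 stream carries.
 */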
245 | #ifdef CONFIG_DRM_I915_COMPRESS_ERROR |
246 | |
247 | struct i915_vma_compress { |
248 | struct folio_batch pool; |
249 | struct z_stream_s zstream; |
250 | void *tmp; |
251 | }; |
252 | |
253 | static bool compress_init(struct i915_vma_compress *c) |
254 | { |
255 | struct z_stream_s *zstream = &c->zstream; |
256 | |
	if (pool_init(&c->pool, ALLOW_FAIL))
258 | return false; |
259 | |
260 | zstream->workspace = |
		kmalloc(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
262 | ALLOW_FAIL); |
263 | if (!zstream->workspace) { |
		pool_fini(&c->pool);
265 | return false; |
266 | } |
267 | |
268 | c->tmp = NULL; |
269 | if (i915_has_memcpy_from_wc()) |
		c->tmp = pool_alloc(&c->pool, ALLOW_FAIL);
271 | |
272 | return true; |
273 | } |
274 | |
275 | static bool compress_start(struct i915_vma_compress *c) |
276 | { |
277 | struct z_stream_s *zstream = &c->zstream; |
278 | void *workspace = zstream->workspace; |
279 | |
280 | memset(zstream, 0, sizeof(*zstream)); |
281 | zstream->workspace = workspace; |
282 | |
283 | return zlib_deflateInit(zstream, Z_DEFAULT_COMPRESSION) == Z_OK; |
284 | } |
285 | |
286 | static void *compress_next_page(struct i915_vma_compress *c, |
287 | struct i915_vma_coredump *dst) |
288 | { |
289 | void *page_addr; |
290 | struct page *page; |
291 | |
	page_addr = pool_alloc(&c->pool, ALLOW_FAIL);
	if (!page_addr)
		return ERR_PTR(-ENOMEM);

	page = virt_to_page(page_addr);
	list_add_tail(&page->lru, &dst->page_list);
298 | return page_addr; |
299 | } |
300 | |
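/*
 * Byte-wise reads from write-combined memory are very slow, so when the
 * source is WC and an accelerated i915_memcpy_from_wc() is available, the
 * page is first staged into c->tmp before being fed to the compressor.
 */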
301 | static int compress_page(struct i915_vma_compress *c, |
302 | void *src, |
303 | struct i915_vma_coredump *dst, |
304 | bool wc) |
305 | { |
306 | struct z_stream_s *zstream = &c->zstream; |
307 | |
308 | zstream->next_in = src; |
	if (wc && c->tmp && i915_memcpy_from_wc(c->tmp, src, PAGE_SIZE))
310 | zstream->next_in = c->tmp; |
311 | zstream->avail_in = PAGE_SIZE; |
312 | |
313 | do { |
314 | if (zstream->avail_out == 0) { |
315 | zstream->next_out = compress_next_page(c, dst); |
			if (IS_ERR(zstream->next_out))
				return PTR_ERR(zstream->next_out);
318 | |
319 | zstream->avail_out = PAGE_SIZE; |
320 | } |
321 | |
		if (zlib_deflate(zstream, Z_NO_FLUSH) != Z_OK)
323 | return -EIO; |
324 | |
325 | cond_resched(); |
326 | } while (zstream->avail_in); |
327 | |
328 | /* Fallback to uncompressed if we increase size? */ |
329 | if (0 && zstream->total_out > zstream->total_in) |
330 | return -E2BIG; |
331 | |
332 | return 0; |
333 | } |
334 | |
335 | static int compress_flush(struct i915_vma_compress *c, |
336 | struct i915_vma_coredump *dst) |
337 | { |
338 | struct z_stream_s *zstream = &c->zstream; |
339 | |
340 | do { |
		switch (zlib_deflate(zstream, Z_FINISH)) {
		case Z_OK: /* more space requested */
			zstream->next_out = compress_next_page(c, dst);
			if (IS_ERR(zstream->next_out))
				return PTR_ERR(zstream->next_out);
346 | |
347 | zstream->avail_out = PAGE_SIZE; |
348 | break; |
349 | |
350 | case Z_STREAM_END: |
351 | goto end; |
352 | |
353 | default: /* any error */ |
354 | return -EIO; |
355 | } |
356 | } while (1); |
357 | |
358 | end: |
359 | memset(zstream->next_out, 0, zstream->avail_out); |
360 | dst->unused = zstream->avail_out; |
361 | return 0; |
362 | } |
363 | |
364 | static void compress_finish(struct i915_vma_compress *c) |
365 | { |
	zlib_deflateEnd(&c->zstream);
367 | } |
368 | |
369 | static void compress_fini(struct i915_vma_compress *c) |
370 | { |
	kfree(c->zstream.workspace);
	if (c->tmp)
		pool_free(&c->pool, c->tmp);
	pool_fini(&c->pool);
375 | } |
376 | |
377 | static void err_compression_marker(struct drm_i915_error_state_buf *m) |
378 | { |
379 | err_puts(m, ":" ); |
380 | } |
381 | |
382 | #else |
383 | |
384 | struct i915_vma_compress { |
385 | struct folio_batch pool; |
386 | }; |
387 | |
388 | static bool compress_init(struct i915_vma_compress *c) |
389 | { |
390 | return pool_init(&c->pool, ALLOW_FAIL) == 0; |
391 | } |
392 | |
393 | static bool compress_start(struct i915_vma_compress *c) |
394 | { |
395 | return true; |
396 | } |
397 | |
398 | static int compress_page(struct i915_vma_compress *c, |
399 | void *src, |
400 | struct i915_vma_coredump *dst, |
401 | bool wc) |
402 | { |
403 | void *ptr; |
404 | |
405 | ptr = pool_alloc(&c->pool, ALLOW_FAIL); |
406 | if (!ptr) |
407 | return -ENOMEM; |
408 | |
409 | if (!(wc && i915_memcpy_from_wc(ptr, src, PAGE_SIZE))) |
410 | memcpy(ptr, src, PAGE_SIZE); |
411 | list_add_tail(&virt_to_page(ptr)->lru, &dst->page_list); |
412 | cond_resched(); |
413 | |
414 | return 0; |
415 | } |
416 | |
417 | static int compress_flush(struct i915_vma_compress *c, |
418 | struct i915_vma_coredump *dst) |
419 | { |
420 | return 0; |
421 | } |
422 | |
423 | static void compress_finish(struct i915_vma_compress *c) |
424 | { |
425 | } |
426 | |
427 | static void compress_fini(struct i915_vma_compress *c) |
428 | { |
429 | pool_fini(&c->pool); |
430 | } |
431 | |
432 | static void err_compression_marker(struct drm_i915_error_state_buf *m) |
433 | { |
434 | err_puts(m, "~" ); |
435 | } |
436 | |
437 | #endif |
438 | |
439 | static void error_print_instdone(struct drm_i915_error_state_buf *m, |
440 | const struct intel_engine_coredump *ee) |
441 | { |
442 | int slice; |
443 | int subslice; |
444 | int iter; |
445 | |
446 | err_printf(m, " INSTDONE: 0x%08x\n" , |
447 | ee->instdone.instdone); |
448 | |
449 | if (ee->engine->class != RENDER_CLASS || GRAPHICS_VER(m->i915) <= 3) |
450 | return; |
451 | |
452 | err_printf(m, " SC_INSTDONE: 0x%08x\n" , |
453 | ee->instdone.slice_common); |
454 | |
455 | if (GRAPHICS_VER(m->i915) <= 6) |
456 | return; |
457 | |
458 | for_each_ss_steering(iter, ee->engine->gt, slice, subslice) |
459 | err_printf(m, " SAMPLER_INSTDONE[%d][%d]: 0x%08x\n" , |
460 | slice, subslice, |
461 | ee->instdone.sampler[slice][subslice]); |
462 | |
463 | for_each_ss_steering(iter, ee->engine->gt, slice, subslice) |
464 | err_printf(m, " ROW_INSTDONE[%d][%d]: 0x%08x\n" , |
465 | slice, subslice, |
466 | ee->instdone.row[slice][subslice]); |
467 | |
468 | if (GRAPHICS_VER(m->i915) < 12) |
469 | return; |
470 | |
471 | if (GRAPHICS_VER_FULL(m->i915) >= IP_VER(12, 55)) { |
472 | for_each_ss_steering(iter, ee->engine->gt, slice, subslice) |
473 | err_printf(m, " GEOM_SVGUNIT_INSTDONE[%d][%d]: 0x%08x\n" , |
474 | slice, subslice, |
475 | ee->instdone.geom_svg[slice][subslice]); |
476 | } |
477 | |
478 | err_printf(m, " SC_INSTDONE_EXTRA: 0x%08x\n" , |
479 | ee->instdone.slice_common_extra[0]); |
480 | err_printf(m, " SC_INSTDONE_EXTRA2: 0x%08x\n" , |
481 | ee->instdone.slice_common_extra[1]); |
482 | } |
483 | |
484 | static void error_print_request(struct drm_i915_error_state_buf *m, |
485 | const char *prefix, |
486 | const struct i915_request_coredump *erq) |
487 | { |
488 | if (!erq->seqno) |
489 | return; |
490 | |
491 | err_printf(m, "%s pid %d, seqno %8x:%08x%s%s, prio %d, head %08x, tail %08x\n" , |
492 | prefix, erq->pid, erq->context, erq->seqno, |
493 | test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, |
494 | &erq->flags) ? "!" : "" , |
495 | test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, |
496 | &erq->flags) ? "+" : "" , |
497 | erq->sched_attr.priority, |
498 | erq->head, erq->tail); |
499 | } |
500 | |
static void error_print_context(struct drm_i915_error_state_buf *m,
				const char *header,
				const struct i915_gem_context_coredump *ctx)
{
	err_printf(m, "%s%s[%d] prio %d, guilty %d active %d, runtime total %lluns, avg %lluns\n",
		   header, ctx->comm, ctx->pid, ctx->sched_attr.priority,
		   ctx->guilty, ctx->active,
		   ctx->total_runtime, ctx->avg_runtime);
	err_printf(m, " context timeline seqno %u\n", ctx->hwsp_seqno);
510 | } |
511 | |
512 | static struct i915_vma_coredump * |
513 | __find_vma(struct i915_vma_coredump *vma, const char *name) |
514 | { |
515 | while (vma) { |
516 | if (strcmp(vma->name, name) == 0) |
517 | return vma; |
518 | vma = vma->next; |
519 | } |
520 | |
521 | return NULL; |
522 | } |
523 | |
524 | static struct i915_vma_coredump * |
525 | intel_gpu_error_find_batch(const struct intel_engine_coredump *ee) |
526 | { |
	return __find_vma(ee->vma, "batch");
528 | } |
529 | |
530 | static void error_print_engine(struct drm_i915_error_state_buf *m, |
531 | const struct intel_engine_coredump *ee) |
532 | { |
533 | struct i915_vma_coredump *batch; |
534 | int n; |
535 | |
536 | err_printf(m, "%s command stream:\n" , ee->engine->name); |
537 | err_printf(m, " CCID: 0x%08x\n" , ee->ccid); |
538 | err_printf(m, " START: 0x%08x\n" , ee->start); |
539 | err_printf(m, " HEAD: 0x%08x [0x%08x]\n" , ee->head, ee->rq_head); |
540 | err_printf(m, " TAIL: 0x%08x [0x%08x, 0x%08x]\n" , |
541 | ee->tail, ee->rq_post, ee->rq_tail); |
542 | err_printf(m, " CTL: 0x%08x\n" , ee->ctl); |
543 | err_printf(m, " MODE: 0x%08x\n" , ee->mode); |
544 | err_printf(m, " HWS: 0x%08x\n" , ee->hws); |
545 | err_printf(m, " ACTHD: 0x%08x %08x\n" , |
546 | (u32)(ee->acthd>>32), (u32)ee->acthd); |
547 | err_printf(m, " IPEIR: 0x%08x\n" , ee->ipeir); |
548 | err_printf(m, " IPEHR: 0x%08x\n" , ee->ipehr); |
549 | err_printf(m, " ESR: 0x%08x\n" , ee->esr); |
550 | |
551 | error_print_instdone(m, ee); |
552 | |
553 | batch = intel_gpu_error_find_batch(ee); |
554 | if (batch) { |
555 | u64 start = batch->gtt_offset; |
556 | u64 end = start + batch->gtt_size; |
557 | |
558 | err_printf(m, " batch: [0x%08x_%08x, 0x%08x_%08x]\n" , |
559 | upper_32_bits(start), lower_32_bits(start), |
560 | upper_32_bits(end), lower_32_bits(end)); |
561 | } |
562 | if (GRAPHICS_VER(m->i915) >= 4) { |
563 | err_printf(m, " BBADDR: 0x%08x_%08x\n" , |
564 | (u32)(ee->bbaddr>>32), (u32)ee->bbaddr); |
565 | err_printf(m, " BB_STATE: 0x%08x\n" , ee->bbstate); |
566 | err_printf(m, " INSTPS: 0x%08x\n" , ee->instps); |
567 | } |
568 | err_printf(m, " INSTPM: 0x%08x\n" , ee->instpm); |
569 | err_printf(m, " FADDR: 0x%08x %08x\n" , upper_32_bits(ee->faddr), |
570 | lower_32_bits(ee->faddr)); |
571 | if (GRAPHICS_VER(m->i915) >= 6) { |
572 | err_printf(m, " RC PSMI: 0x%08x\n" , ee->rc_psmi); |
573 | err_printf(m, " FAULT_REG: 0x%08x\n" , ee->fault_reg); |
574 | } |
575 | if (GRAPHICS_VER(m->i915) >= 11) { |
576 | err_printf(m, " NOPID: 0x%08x\n" , ee->nopid); |
577 | err_printf(m, " EXCC: 0x%08x\n" , ee->excc); |
578 | err_printf(m, " CMD_CCTL: 0x%08x\n" , ee->cmd_cctl); |
579 | err_printf(m, " CSCMDOP: 0x%08x\n" , ee->cscmdop); |
580 | err_printf(m, " CTX_SR_CTL: 0x%08x\n" , ee->ctx_sr_ctl); |
581 | err_printf(m, " DMA_FADDR_HI: 0x%08x\n" , ee->dma_faddr_hi); |
582 | err_printf(m, " DMA_FADDR_LO: 0x%08x\n" , ee->dma_faddr_lo); |
583 | } |
584 | if (HAS_PPGTT(m->i915)) { |
585 | err_printf(m, " GFX_MODE: 0x%08x\n" , ee->vm_info.gfx_mode); |
586 | |
587 | if (GRAPHICS_VER(m->i915) >= 8) { |
588 | int i; |
589 | for (i = 0; i < 4; i++) |
590 | err_printf(m, " PDP%d: 0x%016llx\n" , |
591 | i, ee->vm_info.pdp[i]); |
592 | } else { |
593 | err_printf(m, " PP_DIR_BASE: 0x%08x\n" , |
594 | ee->vm_info.pp_dir_base); |
595 | } |
596 | } |
597 | |
598 | for (n = 0; n < ee->num_ports; n++) { |
599 | err_printf(m, " ELSP[%d]:" , n); |
600 | error_print_request(m, prefix: " " , erq: &ee->execlist[n]); |
601 | } |
602 | } |
603 | |
604 | void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...) |
605 | { |
606 | va_list args; |
607 | |
608 | va_start(args, f); |
	i915_error_vprintf(e, f, args);
610 | va_end(args); |
611 | } |
612 | |
613 | static void intel_gpu_error_print_vma(struct drm_i915_error_state_buf *m, |
614 | const struct intel_engine_cs *engine, |
615 | const struct i915_vma_coredump *vma) |
616 | { |
617 | char out[ASCII85_BUFSZ]; |
618 | struct page *page; |
619 | |
620 | if (!vma) |
621 | return; |
622 | |
623 | err_printf(m, "%s --- %s = 0x%08x %08x\n" , |
624 | engine ? engine->name : "global" , vma->name, |
625 | upper_32_bits(vma->gtt_offset), |
626 | lower_32_bits(vma->gtt_offset)); |
627 | |
628 | if (vma->gtt_page_sizes > I915_GTT_PAGE_SIZE_4K) |
629 | err_printf(m, "gtt_page_sizes = 0x%08x\n" , vma->gtt_page_sizes); |
630 | |
631 | err_compression_marker(m); |
632 | list_for_each_entry(page, &vma->page_list, lru) { |
633 | int i, len; |
634 | const u32 *addr = page_address(page); |
635 | |
636 | len = PAGE_SIZE; |
637 | if (page == list_last_entry(&vma->page_list, typeof(*page), lru)) |
638 | len -= vma->unused; |
639 | len = ascii85_encode_len(len); |
640 | |
641 | for (i = 0; i < len; i++) |
642 | err_puts(m, ascii85_encode(addr[i], out)); |
643 | } |
644 | err_puts(m, "\n" ); |
645 | } |
646 | |
647 | static void err_print_capabilities(struct drm_i915_error_state_buf *m, |
648 | struct i915_gpu_coredump *error) |
649 | { |
	struct drm_printer p = i915_error_printer(m);
651 | |
	intel_device_info_print(&error->device_info, &error->runtime_info, &p);
	intel_display_device_info_print(&error->display_device_info,
					&error->display_runtime_info, &p);
	intel_driver_caps_print(&error->driver_caps, &p);
656 | } |
657 | |
658 | static void err_print_params(struct drm_i915_error_state_buf *m, |
659 | const struct i915_params *params) |
660 | { |
	struct drm_printer p = i915_error_printer(m);
662 | |
	i915_params_dump(params, &p);
	intel_display_params_dump(m->i915, &p);
665 | } |
666 | |
667 | static void err_print_pciid(struct drm_i915_error_state_buf *m, |
668 | struct drm_i915_private *i915) |
669 | { |
670 | struct pci_dev *pdev = to_pci_dev(i915->drm.dev); |
671 | |
672 | err_printf(m, "PCI ID: 0x%04x\n" , pdev->device); |
673 | err_printf(m, "PCI Revision: 0x%02x\n" , pdev->revision); |
674 | err_printf(m, "PCI Subsystem: %04x:%04x\n" , |
675 | pdev->subsystem_vendor, |
676 | pdev->subsystem_device); |
677 | } |
678 | |
679 | static void err_print_guc_ctb(struct drm_i915_error_state_buf *m, |
680 | const char *name, |
681 | const struct intel_ctb_coredump *ctb) |
682 | { |
683 | if (!ctb->size) |
684 | return; |
685 | |
686 | err_printf(m, "GuC %s CTB: raw: 0x%08X, 0x%08X/%08X, cached: 0x%08X/%08X, desc = 0x%08X, buf = 0x%08X x 0x%08X\n" , |
687 | name, ctb->raw_status, ctb->raw_head, ctb->raw_tail, |
688 | ctb->head, ctb->tail, ctb->desc_offset, ctb->cmds_offset, ctb->size); |
689 | } |
690 | |
691 | static void err_print_uc(struct drm_i915_error_state_buf *m, |
692 | const struct intel_uc_coredump *error_uc) |
693 | { |
	struct drm_printer p = i915_error_printer(m);

	intel_uc_fw_dump(&error_uc->guc_fw, &p);
	intel_uc_fw_dump(&error_uc->huc_fw, &p);
	err_printf(m, "GuC timestamp: 0x%08x\n", error_uc->guc.timestamp);
	intel_gpu_error_print_vma(m, NULL, error_uc->guc.vma_log);
	err_printf(m, "GuC CTB fence: %d\n", error_uc->guc.last_fence);
	err_print_guc_ctb(m, "Send", error_uc->guc.ctb + 0);
	err_print_guc_ctb(m, "Recv", error_uc->guc.ctb + 1);
	intel_gpu_error_print_vma(m, NULL, error_uc->guc.vma_ctb);
704 | } |
705 | |
706 | static void err_free_sgl(struct scatterlist *sgl) |
707 | { |
708 | while (sgl) { |
709 | struct scatterlist *sg; |
710 | |
711 | for (sg = sgl; !sg_is_chain(sg); sg++) { |
			kfree(sg_virt(sg));
713 | if (sg_is_last(sg)) |
714 | break; |
715 | } |
716 | |
717 | sg = sg_is_last(sg) ? NULL : sg_chain_ptr(sg); |
718 | free_page((unsigned long)sgl); |
719 | sgl = sg; |
720 | } |
721 | } |
722 | |
723 | static void err_print_gt_info(struct drm_i915_error_state_buf *m, |
724 | struct intel_gt_coredump *gt) |
725 | { |
	struct drm_printer p = i915_error_printer(m);

	intel_gt_info_print(&gt->info, &p);
	intel_sseu_print_topology(gt->_gt->i915, &gt->info.sseu, &p);
730 | } |
731 | |
732 | static void err_print_gt_display(struct drm_i915_error_state_buf *m, |
733 | struct intel_gt_coredump *gt) |
734 | { |
735 | err_printf(m, "IER: 0x%08x\n" , gt->ier); |
736 | err_printf(m, "DERRMR: 0x%08x\n" , gt->derrmr); |
737 | } |
738 | |
739 | static void err_print_gt_global_nonguc(struct drm_i915_error_state_buf *m, |
740 | struct intel_gt_coredump *gt) |
741 | { |
742 | int i; |
743 | |
744 | err_printf(m, "GT awake: %s\n" , str_yes_no(gt->awake)); |
745 | err_printf(m, "CS timestamp frequency: %u Hz, %d ns\n" , |
746 | gt->clock_frequency, gt->clock_period_ns); |
747 | err_printf(m, "EIR: 0x%08x\n" , gt->eir); |
748 | err_printf(m, "PGTBL_ER: 0x%08x\n" , gt->pgtbl_er); |
749 | |
750 | for (i = 0; i < gt->ngtier; i++) |
751 | err_printf(m, "GTIER[%d]: 0x%08x\n" , i, gt->gtier[i]); |
752 | } |
753 | |
754 | static void err_print_gt_global(struct drm_i915_error_state_buf *m, |
755 | struct intel_gt_coredump *gt) |
756 | { |
757 | err_printf(m, "FORCEWAKE: 0x%08x\n" , gt->forcewake); |
758 | |
759 | if (IS_GRAPHICS_VER(m->i915, 6, 11)) { |
760 | err_printf(m, "ERROR: 0x%08x\n" , gt->error); |
761 | err_printf(m, "DONE_REG: 0x%08x\n" , gt->done_reg); |
762 | } |
763 | |
764 | if (GRAPHICS_VER(m->i915) >= 8) |
765 | err_printf(m, "FAULT_TLB_DATA: 0x%08x 0x%08x\n" , |
766 | gt->fault_data1, gt->fault_data0); |
767 | |
768 | if (GRAPHICS_VER(m->i915) == 7) |
769 | err_printf(m, "ERR_INT: 0x%08x\n" , gt->err_int); |
770 | |
771 | if (IS_GRAPHICS_VER(m->i915, 8, 11)) |
772 | err_printf(m, "GTT_CACHE_EN: 0x%08x\n" , gt->gtt_cache); |
773 | |
774 | if (GRAPHICS_VER(m->i915) == 12) |
775 | err_printf(m, "AUX_ERR_DBG: 0x%08x\n" , gt->aux_err); |
776 | |
777 | if (GRAPHICS_VER(m->i915) >= 12) { |
778 | int i; |
779 | |
780 | for (i = 0; i < I915_MAX_SFC; i++) { |
781 | /* |
782 | * SFC_DONE resides in the VD forcewake domain, so it |
783 | * only exists if the corresponding VCS engine is |
784 | * present. |
785 | */ |
786 | if ((gt->_gt->info.sfc_mask & BIT(i)) == 0 || |
787 | !HAS_ENGINE(gt->_gt, _VCS(i * 2))) |
788 | continue; |
789 | |
790 | err_printf(m, " SFC_DONE[%d]: 0x%08x\n" , i, |
791 | gt->sfc_done[i]); |
792 | } |
793 | |
794 | err_printf(m, " GAM_DONE: 0x%08x\n" , gt->gam_done); |
795 | } |
796 | } |
797 | |
798 | static void err_print_gt_fences(struct drm_i915_error_state_buf *m, |
799 | struct intel_gt_coredump *gt) |
800 | { |
801 | int i; |
802 | |
803 | for (i = 0; i < gt->nfence; i++) |
804 | err_printf(m, " fence[%d] = %08llx\n" , i, gt->fence[i]); |
805 | } |
806 | |
807 | static void err_print_gt_engines(struct drm_i915_error_state_buf *m, |
808 | struct intel_gt_coredump *gt) |
809 | { |
810 | const struct intel_engine_coredump *ee; |
811 | |
812 | for (ee = gt->engine; ee; ee = ee->next) { |
813 | const struct i915_vma_coredump *vma; |
814 | |
815 | if (gt->uc && gt->uc->guc.is_guc_capture) { |
816 | if (ee->guc_capture_node) |
817 | intel_guc_capture_print_engine_node(m, ee); |
818 | else |
819 | err_printf(m, " Missing GuC capture node for %s\n" , |
820 | ee->engine->name); |
821 | } else { |
822 | error_print_engine(m, ee); |
823 | } |
824 | |
825 | err_printf(m, " hung: %u\n" , ee->hung); |
826 | err_printf(m, " engine reset count: %u\n" , ee->reset_count); |
827 | error_print_context(m, header: " Active context: " , ctx: &ee->context); |
828 | |
829 | for (vma = ee->vma; vma; vma = vma->next) |
830 | intel_gpu_error_print_vma(m, engine: ee->engine, vma); |
831 | } |
832 | |
833 | } |
834 | |
835 | static void __err_print_to_sgl(struct drm_i915_error_state_buf *m, |
836 | struct i915_gpu_coredump *error) |
837 | { |
838 | const struct intel_engine_coredump *ee; |
839 | struct timespec64 ts; |
840 | |
	if (*error->error_msg)
		err_printf(m, "%s\n", error->error_msg);
	err_printf(m, "Kernel: %s %s\n",
		   init_utsname()->release,
		   init_utsname()->machine);
	err_printf(m, "Driver: %s\n", DRIVER_DATE);
	ts = ktime_to_timespec64(error->time);
	err_printf(m, "Time: %lld s %ld us\n",
		   (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
	ts = ktime_to_timespec64(error->boottime);
	err_printf(m, "Boottime: %lld s %ld us\n",
		   (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
	ts = ktime_to_timespec64(error->uptime);
	err_printf(m, "Uptime: %lld s %ld us\n",
		   (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
	err_printf(m, "Capture: %lu jiffies; %d ms ago\n",
		   error->capture, jiffies_to_msecs(jiffies - error->capture));

	for (ee = error->gt ? error->gt->engine : NULL; ee; ee = ee->next)
		err_printf(m, "Active process (on ring %s): %s [%d]\n",
			   ee->engine->name,
			   ee->context.comm,
			   ee->context.pid);

	err_printf(m, "Reset count: %u\n", error->reset_count);
	err_printf(m, "Suspend count: %u\n", error->suspend_count);
	err_printf(m, "Platform: %s\n", intel_platform_name(error->device_info.platform));
	err_printf(m, "Subplatform: 0x%x\n",
		   intel_subplatform(&error->runtime_info,
				     error->device_info.platform));
	err_print_pciid(m, m->i915);

	err_printf(m, "IOMMU enabled?: %d\n", error->iommu);

	intel_dmc_print_error_state(m, m->i915);

	err_printf(m, "RPM wakelock: %s\n", str_yes_no(error->wakelock));
	err_printf(m, "PM suspended: %s\n", str_yes_no(error->suspended));
879 | |
880 | if (error->gt) { |
881 | bool print_guc_capture = false; |
882 | |
883 | if (error->gt->uc && error->gt->uc->guc.is_guc_capture) |
884 | print_guc_capture = true; |
885 | |
		err_print_gt_display(m, error->gt);
		err_print_gt_global_nonguc(m, error->gt);
		err_print_gt_fences(m, error->gt);

		/*
		 * GuC dumped global, eng-class and eng-instance registers together
		 * as part of engine state dump so we print in err_print_gt_engines
		 */
		if (!print_guc_capture)
			err_print_gt_global(m, error->gt);

		err_print_gt_engines(m, error->gt);

		if (error->gt->uc)
			err_print_uc(m, error->gt->uc);

		err_print_gt_info(m, error->gt);
	}

	if (error->overlay)
		intel_overlay_print_error_state(m, error->overlay);

	err_print_capabilities(m, error);
	err_print_params(m, &error->params);
910 | } |
911 | |
912 | static int err_print_to_sgl(struct i915_gpu_coredump *error) |
913 | { |
914 | struct drm_i915_error_state_buf m; |
915 | |
	if (IS_ERR(error))
		return PTR_ERR(error);
918 | |
919 | if (READ_ONCE(error->sgl)) |
920 | return 0; |
921 | |
922 | memset(&m, 0, sizeof(m)); |
923 | m.i915 = error->i915; |
924 | |
	__err_print_to_sgl(&m, error);
926 | |
927 | if (m.buf) { |
		__sg_set_buf(m.cur++, m.buf, m.bytes, m.iter);
929 | m.bytes = 0; |
930 | m.buf = NULL; |
931 | } |
932 | if (m.cur) { |
933 | GEM_BUG_ON(m.end < m.cur); |
		sg_mark_end(m.cur - 1);
935 | } |
936 | GEM_BUG_ON(m.sgl && !m.cur); |
937 | |
938 | if (m.err) { |
		err_free_sgl(m.sgl);
940 | return m.err; |
941 | } |
942 | |
943 | if (cmpxchg(&error->sgl, NULL, m.sgl)) |
		err_free_sgl(m.sgl);
945 | |
946 | return 0; |
947 | } |
948 | |
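/*
 * Copy a window of the formatted error state into @buf. Readers typically
 * consume the dump sequentially in chunks, so the scatterlist entry that
 * satisfied the previous read is cached in error->fit and reused as the
 * starting point whenever the requested offset lands at or beyond it.
 */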
949 | ssize_t i915_gpu_coredump_copy_to_buffer(struct i915_gpu_coredump *error, |
950 | char *buf, loff_t off, size_t rem) |
951 | { |
952 | struct scatterlist *sg; |
953 | size_t count; |
954 | loff_t pos; |
955 | int err; |
956 | |
957 | if (!error || !rem) |
958 | return 0; |
959 | |
960 | err = err_print_to_sgl(error); |
961 | if (err) |
962 | return err; |
963 | |
964 | sg = READ_ONCE(error->fit); |
965 | if (!sg || off < sg->dma_address) |
966 | sg = error->sgl; |
967 | if (!sg) |
968 | return 0; |
969 | |
970 | pos = sg->dma_address; |
971 | count = 0; |
972 | do { |
973 | size_t len, start; |
974 | |
975 | if (sg_is_chain(sg)) { |
976 | sg = sg_chain_ptr(sg); |
977 | GEM_BUG_ON(sg_is_chain(sg)); |
978 | } |
979 | |
980 | len = sg->length; |
981 | if (pos + len <= off) { |
982 | pos += len; |
983 | continue; |
984 | } |
985 | |
986 | start = sg->offset; |
987 | if (pos < off) { |
988 | GEM_BUG_ON(off - pos > len); |
989 | len -= off - pos; |
990 | start += off - pos; |
991 | pos = off; |
992 | } |
993 | |
994 | len = min(len, rem); |
995 | GEM_BUG_ON(!len || len > sg->length); |
996 | |
997 | memcpy(buf, page_address(sg_page(sg)) + start, len); |
998 | |
999 | count += len; |
1000 | pos += len; |
1001 | |
1002 | buf += len; |
1003 | rem -= len; |
1004 | if (!rem) { |
1005 | WRITE_ONCE(error->fit, sg); |
1006 | break; |
1007 | } |
	} while (!sg_is_last(sg++));
1009 | |
1010 | return count; |
1011 | } |
1012 | |
1013 | static void i915_vma_coredump_free(struct i915_vma_coredump *vma) |
1014 | { |
1015 | while (vma) { |
1016 | struct i915_vma_coredump *next = vma->next; |
1017 | struct page *page, *n; |
1018 | |
1019 | list_for_each_entry_safe(page, n, &vma->page_list, lru) { |
			list_del_init(&page->lru);
1021 | __free_page(page); |
1022 | } |
1023 | |
		kfree(vma);
1025 | vma = next; |
1026 | } |
1027 | } |
1028 | |
1029 | static void cleanup_params(struct i915_gpu_coredump *error) |
1030 | { |
	i915_params_free(&error->params);
	intel_display_params_free(&error->display_params);
1033 | } |
1034 | |
1035 | static void cleanup_uc(struct intel_uc_coredump *uc) |
1036 | { |
	kfree(uc->guc_fw.file_selected.path);
	kfree(uc->huc_fw.file_selected.path);
	kfree(uc->guc_fw.file_wanted.path);
	kfree(uc->huc_fw.file_wanted.path);
	i915_vma_coredump_free(uc->guc.vma_log);
	i915_vma_coredump_free(uc->guc.vma_ctb);

	kfree(uc);
1045 | } |
1046 | |
1047 | static void cleanup_gt(struct intel_gt_coredump *gt) |
1048 | { |
1049 | while (gt->engine) { |
1050 | struct intel_engine_coredump *ee = gt->engine; |
1051 | |
1052 | gt->engine = ee->next; |
1053 | |
		i915_vma_coredump_free(ee->vma);
		intel_guc_capture_free_node(ee);
		kfree(ee);
	}

	if (gt->uc)
		cleanup_uc(gt->uc);

	kfree(gt);
1063 | } |
1064 | |
1065 | void __i915_gpu_coredump_free(struct kref *error_ref) |
1066 | { |
1067 | struct i915_gpu_coredump *error = |
1068 | container_of(error_ref, typeof(*error), ref); |
1069 | |
1070 | while (error->gt) { |
1071 | struct intel_gt_coredump *gt = error->gt; |
1072 | |
1073 | error->gt = gt->next; |
1074 | cleanup_gt(gt); |
1075 | } |
1076 | |
	kfree(error->overlay);
1078 | |
1079 | cleanup_params(error); |
1080 | |
	err_free_sgl(error->sgl);
	kfree(error);
1083 | } |
1084 | |
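/*
 * Snapshot the contents of a vma into a newly allocated coredump node.
 * Depending on where the backing store lives, pages are read back either
 * through the reserved GGTT error-capture slot, directly through the local
 * memory region's io mapping, or via kmap of ordinary system pages (with
 * clflushes around the copy so dirty GPU data is observed).
 */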
1085 | static struct i915_vma_coredump * |
1086 | i915_vma_coredump_create(const struct intel_gt *gt, |
1087 | const struct i915_vma_resource *vma_res, |
1088 | struct i915_vma_compress *compress, |
1089 | const char *name) |
1090 | |
1091 | { |
1092 | struct i915_ggtt *ggtt = gt->ggtt; |
1093 | const u64 slot = ggtt->error_capture.start; |
1094 | struct i915_vma_coredump *dst; |
1095 | struct sgt_iter iter; |
1096 | int ret; |
1097 | |
1098 | might_sleep(); |
1099 | |
1100 | if (!vma_res || !vma_res->bi.pages || !compress) |
1101 | return NULL; |
1102 | |
	dst = kmalloc(sizeof(*dst), ALLOW_FAIL);
	if (!dst)
		return NULL;

	if (!compress_start(compress)) {
		kfree(dst);
		return NULL;
	}

	INIT_LIST_HEAD(&dst->page_list);
	strcpy(dst->name, name);
1114 | dst->next = NULL; |
1115 | |
1116 | dst->gtt_offset = vma_res->start; |
1117 | dst->gtt_size = vma_res->node_size; |
1118 | dst->gtt_page_sizes = vma_res->page_sizes_gtt; |
1119 | dst->unused = 0; |
1120 | |
1121 | ret = -EINVAL; |
	if (drm_mm_node_allocated(&ggtt->error_capture)) {
1123 | void __iomem *s; |
1124 | dma_addr_t dma; |
1125 | |
1126 | for_each_sgt_daddr(dma, iter, vma_res->bi.pages) { |
1127 | mutex_lock(&ggtt->error_mutex); |
			if (ggtt->vm.raw_insert_page)
				ggtt->vm.raw_insert_page(&ggtt->vm, dma, slot,
							 i915_gem_get_pat_index(gt->i915,
										I915_CACHE_NONE),
							 0);
			else
				ggtt->vm.insert_page(&ggtt->vm, dma, slot,
						     i915_gem_get_pat_index(gt->i915,
									    I915_CACHE_NONE),
						     0);
1138 | mb(); |
1139 | |
1140 | s = io_mapping_map_wc(mapping: &ggtt->iomap, offset: slot, PAGE_SIZE); |
1141 | ret = compress_page(c: compress, |
1142 | src: (void __force *)s, dst, |
1143 | wc: true); |
1144 | io_mapping_unmap(vaddr: s); |
1145 | |
1146 | mb(); |
1147 | ggtt->vm.clear_range(&ggtt->vm, slot, PAGE_SIZE); |
			mutex_unlock(&ggtt->error_mutex);
1149 | if (ret) |
1150 | break; |
1151 | } |
1152 | } else if (vma_res->bi.lmem) { |
1153 | struct intel_memory_region *mem = vma_res->mr; |
1154 | dma_addr_t dma; |
1155 | |
1156 | for_each_sgt_daddr(dma, iter, vma_res->bi.pages) { |
1157 | dma_addr_t offset = dma - mem->region.start; |
1158 | void __iomem *s; |
1159 | |
			if (offset + PAGE_SIZE > resource_size(&mem->io)) {
1161 | ret = -EINVAL; |
1162 | break; |
1163 | } |
1164 | |
1165 | s = io_mapping_map_wc(mapping: &mem->iomap, offset, PAGE_SIZE); |
1166 | ret = compress_page(c: compress, |
1167 | src: (void __force *)s, dst, |
1168 | wc: true); |
1169 | io_mapping_unmap(vaddr: s); |
1170 | if (ret) |
1171 | break; |
1172 | } |
1173 | } else { |
1174 | struct page *page; |
1175 | |
1176 | for_each_sgt_page(page, iter, vma_res->bi.pages) { |
1177 | void *s; |
1178 | |
			drm_clflush_pages(&page, 1);

			s = kmap_local_page(page);
			ret = compress_page(compress, s, dst, false);
			kunmap_local(s);

			drm_clflush_pages(&page, 1);
1186 | |
1187 | if (ret) |
1188 | break; |
1189 | } |
1190 | } |
1191 | |
	if (ret || compress_flush(compress, dst)) {
1193 | struct page *page, *n; |
1194 | |
1195 | list_for_each_entry_safe_reverse(page, n, &dst->page_list, lru) { |
			list_del_init(&page->lru);
			pool_free(&compress->pool, page_address(page));
1198 | } |
1199 | |
		kfree(dst);
1201 | dst = NULL; |
1202 | } |
	compress_finish(compress);
1204 | |
1205 | return dst; |
1206 | } |
1207 | |
1208 | static void gt_record_fences(struct intel_gt_coredump *gt) |
1209 | { |
1210 | struct i915_ggtt *ggtt = gt->_gt->ggtt; |
1211 | struct intel_uncore *uncore = gt->_gt->uncore; |
1212 | int i; |
1213 | |
1214 | if (GRAPHICS_VER(uncore->i915) >= 6) { |
1215 | for (i = 0; i < ggtt->num_fences; i++) |
1216 | gt->fence[i] = |
1217 | intel_uncore_read64(uncore, |
1218 | FENCE_REG_GEN6_LO(i)); |
1219 | } else if (GRAPHICS_VER(uncore->i915) >= 4) { |
1220 | for (i = 0; i < ggtt->num_fences; i++) |
1221 | gt->fence[i] = |
1222 | intel_uncore_read64(uncore, |
1223 | FENCE_REG_965_LO(i)); |
1224 | } else { |
1225 | for (i = 0; i < ggtt->num_fences; i++) |
1226 | gt->fence[i] = |
1227 | intel_uncore_read(uncore, FENCE_REG(i)); |
1228 | } |
1229 | gt->nfence = i; |
1230 | } |
1231 | |
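/*
 * Read back the engine registers while the hang is still in place. Which
 * registers exist, and at what offsets, varies across generations; this
 * mirrors the per-generation layout printed by error_print_engine().
 */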
1232 | static void engine_record_registers(struct intel_engine_coredump *ee) |
1233 | { |
1234 | const struct intel_engine_cs *engine = ee->engine; |
1235 | struct drm_i915_private *i915 = engine->i915; |
1236 | |
1237 | if (GRAPHICS_VER(i915) >= 6) { |
1238 | ee->rc_psmi = ENGINE_READ(engine, RING_PSMI_CTL); |
1239 | |
1240 | /* |
1241 | * For the media GT, this ring fault register is not replicated, |
1242 | * so don't do multicast/replicated register read/write |
1243 | * operation on it. |
1244 | */ |
1245 | if (MEDIA_VER(i915) >= 13 && engine->gt->type == GT_MEDIA) |
			ee->fault_reg = intel_uncore_read(engine->uncore,
							  XELPMP_RING_FAULT_REG);
		else if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50))
			ee->fault_reg = intel_gt_mcr_read_any(engine->gt,
							      XEHP_RING_FAULT_REG);
		else if (GRAPHICS_VER(i915) >= 12)
			ee->fault_reg = intel_uncore_read(engine->uncore,
							  GEN12_RING_FAULT_REG);
		else if (GRAPHICS_VER(i915) >= 8)
			ee->fault_reg = intel_uncore_read(engine->uncore,
							  GEN8_RING_FAULT_REG);
		else
			ee->fault_reg = GEN6_RING_FAULT_REG_READ(engine);
1260 | } |
1261 | |
1262 | if (GRAPHICS_VER(i915) >= 4) { |
1263 | ee->esr = ENGINE_READ(engine, RING_ESR); |
1264 | ee->faddr = ENGINE_READ(engine, RING_DMA_FADD); |
1265 | ee->ipeir = ENGINE_READ(engine, RING_IPEIR); |
1266 | ee->ipehr = ENGINE_READ(engine, RING_IPEHR); |
1267 | ee->instps = ENGINE_READ(engine, RING_INSTPS); |
1268 | ee->bbaddr = ENGINE_READ(engine, RING_BBADDR); |
1269 | ee->ccid = ENGINE_READ(engine, CCID); |
1270 | if (GRAPHICS_VER(i915) >= 8) { |
1271 | ee->faddr |= (u64)ENGINE_READ(engine, RING_DMA_FADD_UDW) << 32; |
1272 | ee->bbaddr |= (u64)ENGINE_READ(engine, RING_BBADDR_UDW) << 32; |
1273 | } |
1274 | ee->bbstate = ENGINE_READ(engine, RING_BBSTATE); |
1275 | } else { |
1276 | ee->faddr = ENGINE_READ(engine, DMA_FADD_I8XX); |
1277 | ee->ipeir = ENGINE_READ(engine, IPEIR); |
1278 | ee->ipehr = ENGINE_READ(engine, IPEHR); |
1279 | } |
1280 | |
1281 | if (GRAPHICS_VER(i915) >= 11) { |
1282 | ee->cmd_cctl = ENGINE_READ(engine, RING_CMD_CCTL); |
1283 | ee->cscmdop = ENGINE_READ(engine, RING_CSCMDOP); |
1284 | ee->ctx_sr_ctl = ENGINE_READ(engine, RING_CTX_SR_CTL); |
1285 | ee->dma_faddr_hi = ENGINE_READ(engine, RING_DMA_FADD_UDW); |
1286 | ee->dma_faddr_lo = ENGINE_READ(engine, RING_DMA_FADD); |
1287 | ee->nopid = ENGINE_READ(engine, RING_NOPID); |
1288 | ee->excc = ENGINE_READ(engine, RING_EXCC); |
1289 | } |
1290 | |
	intel_engine_get_instdone(engine, &ee->instdone);
1292 | |
1293 | ee->instpm = ENGINE_READ(engine, RING_INSTPM); |
1294 | ee->acthd = intel_engine_get_active_head(engine); |
1295 | ee->start = ENGINE_READ(engine, RING_START); |
1296 | ee->head = ENGINE_READ(engine, RING_HEAD); |
1297 | ee->tail = ENGINE_READ(engine, RING_TAIL); |
1298 | ee->ctl = ENGINE_READ(engine, RING_CTL); |
1299 | if (GRAPHICS_VER(i915) > 2) |
1300 | ee->mode = ENGINE_READ(engine, RING_MI_MODE); |
1301 | |
1302 | if (!HWS_NEEDS_PHYSICAL(i915)) { |
1303 | i915_reg_t mmio; |
1304 | |
1305 | if (GRAPHICS_VER(i915) == 7) { |
1306 | switch (engine->id) { |
1307 | default: |
1308 | MISSING_CASE(engine->id); |
1309 | fallthrough; |
1310 | case RCS0: |
1311 | mmio = RENDER_HWS_PGA_GEN7; |
1312 | break; |
1313 | case BCS0: |
1314 | mmio = BLT_HWS_PGA_GEN7; |
1315 | break; |
1316 | case VCS0: |
1317 | mmio = BSD_HWS_PGA_GEN7; |
1318 | break; |
1319 | case VECS0: |
1320 | mmio = VEBOX_HWS_PGA_GEN7; |
1321 | break; |
1322 | } |
1323 | } else if (GRAPHICS_VER(engine->i915) == 6) { |
1324 | mmio = RING_HWS_PGA_GEN6(engine->mmio_base); |
1325 | } else { |
1326 | /* XXX: gen8 returns to sanity */ |
1327 | mmio = RING_HWS_PGA(engine->mmio_base); |
1328 | } |
1329 | |
		ee->hws = intel_uncore_read(engine->uncore, mmio);
1331 | } |
1332 | |
	ee->reset_count = i915_reset_engine_count(&i915->gpu_error, engine);
1334 | |
1335 | if (HAS_PPGTT(i915)) { |
1336 | int i; |
1337 | |
1338 | ee->vm_info.gfx_mode = ENGINE_READ(engine, RING_MODE_GEN7); |
1339 | |
1340 | if (GRAPHICS_VER(i915) == 6) { |
1341 | ee->vm_info.pp_dir_base = |
1342 | ENGINE_READ(engine, RING_PP_DIR_BASE_READ); |
1343 | } else if (GRAPHICS_VER(i915) == 7) { |
1344 | ee->vm_info.pp_dir_base = |
1345 | ENGINE_READ(engine, RING_PP_DIR_BASE); |
1346 | } else if (GRAPHICS_VER(i915) >= 8) { |
1347 | u32 base = engine->mmio_base; |
1348 | |
1349 | for (i = 0; i < 4; i++) { |
1350 | ee->vm_info.pdp[i] = |
					intel_uncore_read(engine->uncore,
1352 | GEN8_RING_PDP_UDW(base, i)); |
1353 | ee->vm_info.pdp[i] <<= 32; |
1354 | ee->vm_info.pdp[i] |= |
					intel_uncore_read(engine->uncore,
1356 | GEN8_RING_PDP_LDW(base, i)); |
1357 | } |
1358 | } |
1359 | } |
1360 | } |
1361 | |
1362 | static void record_request(const struct i915_request *request, |
1363 | struct i915_request_coredump *erq) |
1364 | { |
1365 | erq->flags = request->fence.flags; |
1366 | erq->context = request->fence.context; |
1367 | erq->seqno = request->fence.seqno; |
1368 | erq->sched_attr = request->sched.attr; |
1369 | erq->head = request->head; |
1370 | erq->tail = request->tail; |
1371 | |
1372 | erq->pid = 0; |
1373 | rcu_read_lock(); |
	if (!intel_context_is_closed(request->context)) {
1375 | const struct i915_gem_context *ctx; |
1376 | |
1377 | ctx = rcu_dereference(request->context->gem_context); |
1378 | if (ctx) |
			erq->pid = pid_nr(ctx->pid);
1380 | } |
1381 | rcu_read_unlock(); |
1382 | } |
1383 | |
1384 | static void engine_record_execlists(struct intel_engine_coredump *ee) |
1385 | { |
1386 | const struct intel_engine_execlists * const el = &ee->engine->execlists; |
1387 | struct i915_request * const *port = el->active; |
1388 | unsigned int n = 0; |
1389 | |
1390 | while (*port) |
		record_request(*port++, &ee->execlist[n++]);
1392 | |
1393 | ee->num_ports = n; |
1394 | } |
1395 | |
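/*
 * The GEM context can be released concurrently with the capture, so it is
 * looked up under RCU and kept only if a reference can still be taken.
 * Returns true when the context asked not to have its error state
 * captured, which the callers treat as a simulated hang and discard.
 */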
1396 | static bool record_context(struct i915_gem_context_coredump *e, |
1397 | struct intel_context *ce) |
1398 | { |
1399 | struct i915_gem_context *ctx; |
1400 | struct task_struct *task; |
1401 | bool simulated; |
1402 | |
1403 | rcu_read_lock(); |
1404 | ctx = rcu_dereference(ce->gem_context); |
1405 | if (ctx && !kref_get_unless_zero(kref: &ctx->ref)) |
1406 | ctx = NULL; |
1407 | rcu_read_unlock(); |
1408 | if (!ctx) |
1409 | return true; |
1410 | |
1411 | rcu_read_lock(); |
	task = pid_task(ctx->pid, PIDTYPE_PID);
1413 | if (task) { |
		strcpy(e->comm, task->comm);
1415 | e->pid = task->pid; |
1416 | } |
1417 | rcu_read_unlock(); |
1418 | |
1419 | e->sched_attr = ctx->sched; |
	e->guilty = atomic_read(&ctx->guilty_count);
	e->active = atomic_read(&ctx->active_count);
1422 | e->hwsp_seqno = (ce->timeline && ce->timeline->hwsp_seqno) ? |
1423 | *ce->timeline->hwsp_seqno : ~0U; |
1424 | |
1425 | e->total_runtime = intel_context_get_total_runtime_ns(ce); |
1426 | e->avg_runtime = intel_context_get_avg_runtime_ns(ce); |
1427 | |
1428 | simulated = i915_gem_context_no_error_capture(ctx); |
1429 | |
1430 | i915_gem_context_put(ctx); |
1431 | return simulated; |
1432 | } |
1433 | |
1434 | struct intel_engine_capture_vma { |
1435 | struct intel_engine_capture_vma *next; |
1436 | struct i915_vma_resource *vma_res; |
1437 | char name[16]; |
1438 | bool lockdep_cookie; |
1439 | }; |
1440 | |
1441 | static struct intel_engine_capture_vma * |
1442 | capture_vma_snapshot(struct intel_engine_capture_vma *next, |
1443 | struct i915_vma_resource *vma_res, |
1444 | gfp_t gfp, const char *name) |
1445 | { |
1446 | struct intel_engine_capture_vma *c; |
1447 | |
1448 | if (!vma_res) |
1449 | return next; |
1450 | |
	c = kmalloc(sizeof(*c), gfp);
1452 | if (!c) |
1453 | return next; |
1454 | |
	if (!i915_vma_resource_hold(vma_res, &c->lockdep_cookie)) {
		kfree(c);
1457 | return next; |
1458 | } |
1459 | |
	strcpy(c->name, name);
1461 | c->vma_res = i915_vma_resource_get(vma_res); |
1462 | |
1463 | c->next = next; |
1464 | return c; |
1465 | } |
1466 | |
1467 | static struct intel_engine_capture_vma * |
1468 | capture_vma(struct intel_engine_capture_vma *next, |
1469 | struct i915_vma *vma, |
1470 | const char *name, |
1471 | gfp_t gfp) |
1472 | { |
1473 | if (!vma) |
1474 | return next; |
1475 | |
1476 | /* |
1477 | * If the vma isn't pinned, then the vma should be snapshotted |
1478 | * to a struct i915_vma_snapshot at command submission time. |
1479 | * Not here. |
1480 | */ |
1481 | if (GEM_WARN_ON(!i915_vma_is_pinned(vma))) |
1482 | return next; |
1483 | |
	next = capture_vma_snapshot(next, vma->resource, gfp, name);
1485 | |
1486 | return next; |
1487 | } |
1488 | |
1489 | static struct intel_engine_capture_vma * |
1490 | capture_user(struct intel_engine_capture_vma *capture, |
1491 | const struct i915_request *rq, |
1492 | gfp_t gfp) |
1493 | { |
1494 | struct i915_capture_list *c; |
1495 | |
1496 | for (c = rq->capture_list; c; c = c->next) |
		capture = capture_vma_snapshot(capture, c->vma_res, gfp,
					       "user");
1499 | |
1500 | return capture; |
1501 | } |
1502 | |
1503 | static void add_vma(struct intel_engine_coredump *ee, |
1504 | struct i915_vma_coredump *vma) |
1505 | { |
1506 | if (vma) { |
1507 | vma->next = ee->vma; |
1508 | ee->vma = vma; |
1509 | } |
1510 | } |
1511 | |
1512 | static struct i915_vma_coredump * |
1513 | create_vma_coredump(const struct intel_gt *gt, struct i915_vma *vma, |
1514 | const char *name, struct i915_vma_compress *compress) |
1515 | { |
1516 | struct i915_vma_coredump *ret = NULL; |
1517 | struct i915_vma_resource *vma_res; |
1518 | bool lockdep_cookie; |
1519 | |
1520 | if (!vma) |
1521 | return NULL; |
1522 | |
1523 | vma_res = vma->resource; |
1524 | |
	if (i915_vma_resource_hold(vma_res, &lockdep_cookie)) {
1526 | ret = i915_vma_coredump_create(gt, vma_res, compress, name); |
1527 | i915_vma_resource_unhold(vma_res, lockdep_cookie); |
1528 | } |
1529 | |
1530 | return ret; |
1531 | } |
1532 | |
1533 | static void add_vma_coredump(struct intel_engine_coredump *ee, |
1534 | const struct intel_gt *gt, |
1535 | struct i915_vma *vma, |
1536 | const char *name, |
1537 | struct i915_vma_compress *compress) |
1538 | { |
	add_vma(ee, create_vma_coredump(gt, vma, name, compress));
1540 | } |
1541 | |
1542 | struct intel_engine_coredump * |
1543 | intel_engine_coredump_alloc(struct intel_engine_cs *engine, gfp_t gfp, u32 dump_flags) |
1544 | { |
1545 | struct intel_engine_coredump *ee; |
1546 | |
	ee = kzalloc(sizeof(*ee), gfp);
1548 | if (!ee) |
1549 | return NULL; |
1550 | |
1551 | ee->engine = engine; |
1552 | |
1553 | if (!(dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE)) { |
1554 | engine_record_registers(ee); |
1555 | engine_record_execlists(ee); |
1556 | } |
1557 | |
1558 | return ee; |
1559 | } |
1560 | |
1561 | static struct intel_engine_capture_vma * |
1562 | engine_coredump_add_context(struct intel_engine_coredump *ee, |
1563 | struct intel_context *ce, |
1564 | gfp_t gfp) |
1565 | { |
1566 | struct intel_engine_capture_vma *vma = NULL; |
1567 | |
	ee->simulated |= record_context(&ee->context, ce);
1569 | if (ee->simulated) |
1570 | return NULL; |
1571 | |
1572 | /* |
1573 | * We need to copy these to an anonymous buffer |
1574 | * as the simplest method to avoid being overwritten |
1575 | * by userspace. |
1576 | */ |
	vma = capture_vma(vma, ce->ring->vma, "ring", gfp);
	vma = capture_vma(vma, ce->state, "HW context", gfp);
1579 | |
1580 | return vma; |
1581 | } |
1582 | |
1583 | struct intel_engine_capture_vma * |
1584 | intel_engine_coredump_add_request(struct intel_engine_coredump *ee, |
1585 | struct i915_request *rq, |
1586 | gfp_t gfp) |
1587 | { |
1588 | struct intel_engine_capture_vma *vma; |
1589 | |
	vma = engine_coredump_add_context(ee, rq->context, gfp);
1591 | if (!vma) |
1592 | return NULL; |
1593 | |
1594 | /* |
1595 | * We need to copy these to an anonymous buffer |
1596 | * as the simplest method to avoid being overwritten |
1597 | * by userspace. |
1598 | */ |
	vma = capture_vma_snapshot(vma, rq->batch_res, gfp, "batch");
	vma = capture_user(vma, rq, gfp);
1601 | |
1602 | ee->rq_head = rq->head; |
1603 | ee->rq_post = rq->postfix; |
1604 | ee->rq_tail = rq->tail; |
1605 | |
1606 | return vma; |
1607 | } |
1608 | |
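/*
 * Drain the capture list built by intel_engine_coredump_add_request():
 * each entry still holds its vma resource, so compress the pages, then
 * drop the hold and the reference. The engine's HWSP and workaround
 * context, which exist independently of any request, are appended last.
 */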
1609 | void |
1610 | intel_engine_coredump_add_vma(struct intel_engine_coredump *ee, |
1611 | struct intel_engine_capture_vma *capture, |
1612 | struct i915_vma_compress *compress) |
1613 | { |
1614 | const struct intel_engine_cs *engine = ee->engine; |
1615 | |
1616 | while (capture) { |
1617 | struct intel_engine_capture_vma *this = capture; |
1618 | struct i915_vma_resource *vma_res = this->vma_res; |
1619 | |
		add_vma(ee,
			i915_vma_coredump_create(engine->gt, vma_res,
						 compress, this->name));

		i915_vma_resource_unhold(vma_res, this->lockdep_cookie);
1625 | i915_vma_resource_put(vma_res); |
1626 | |
1627 | capture = this->next; |
		kfree(this);
1629 | } |
1630 | |
	add_vma_coredump(ee, engine->gt, engine->status_page.vma,
			 "HW Status", compress);

	add_vma_coredump(ee, engine->gt, engine->wa_ctx.vma,
			 "WA context", compress);
1636 | } |
1637 | |
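/*
 * Capture a single engine: allocate the coredump node, look up the hung
 * context and/or request, and snapshot the associated vmas. The vma
 * snapshots use ATOMIC_MAYFAIL because this can run in atomic context;
 * gt_record_engines() refills the page pool beforehand to compensate.
 */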
1638 | static struct intel_engine_coredump * |
1639 | capture_engine(struct intel_engine_cs *engine, |
1640 | struct i915_vma_compress *compress, |
1641 | u32 dump_flags) |
1642 | { |
1643 | struct intel_engine_capture_vma *capture = NULL; |
1644 | struct intel_engine_coredump *ee; |
1645 | struct intel_context *ce = NULL; |
1646 | struct i915_request *rq = NULL; |
1647 | |
1648 | ee = intel_engine_coredump_alloc(engine, ALLOW_FAIL, dump_flags); |
1649 | if (!ee) |
1650 | return NULL; |
1651 | |
	intel_engine_get_hung_entity(engine, &ce, &rq);
	if (rq && !i915_request_started(rq))
		drm_info(&engine->gt->i915->drm, "Got hung context on %s with active request %lld:%lld [0x%04X] not yet started\n",
			 engine->name, rq->fence.context, rq->fence.seqno, ce->guc_id.id);
1656 | |
1657 | if (rq) { |
1658 | capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL); |
1659 | i915_request_put(rq); |
1660 | } else if (ce) { |
1661 | capture = engine_coredump_add_context(ee, ce, ATOMIC_MAYFAIL); |
1662 | } |
1663 | |
1664 | if (capture) { |
1665 | intel_engine_coredump_add_vma(ee, capture, compress); |
1666 | |
1667 | if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE) |
			intel_guc_capture_get_matching_node(engine->gt, ee, ce);
1669 | } else { |
		kfree(ee);
1671 | ee = NULL; |
1672 | } |
1673 | |
1674 | return ee; |
1675 | } |
1676 | |
1677 | static void |
1678 | gt_record_engines(struct intel_gt_coredump *gt, |
1679 | intel_engine_mask_t engine_mask, |
1680 | struct i915_vma_compress *compress, |
1681 | u32 dump_flags) |
1682 | { |
1683 | struct intel_engine_cs *engine; |
1684 | enum intel_engine_id id; |
1685 | |
1686 | for_each_engine(engine, gt->_gt, id) { |
1687 | struct intel_engine_coredump *ee; |
1688 | |
1689 | /* Refill our page pool before entering atomic section */ |
		pool_refill(&compress->pool, ALLOW_FAIL);
1691 | |
1692 | ee = capture_engine(engine, compress, dump_flags); |
1693 | if (!ee) |
1694 | continue; |
1695 | |
1696 | ee->hung = engine->mask & engine_mask; |
1697 | |
1698 | gt->simulated |= ee->simulated; |
1699 | if (ee->simulated) { |
1700 | if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE) |
1701 | intel_guc_capture_free_node(ee); |
			kfree(ee);
1703 | continue; |
1704 | } |
1705 | |
1706 | ee->next = gt->engine; |
1707 | gt->engine = ee; |
1708 | } |
1709 | } |
1710 | |
1711 | static void gt_record_guc_ctb(struct intel_ctb_coredump *saved, |
1712 | const struct intel_guc_ct_buffer *ctb, |
1713 | const void *blob_ptr, struct intel_guc *guc) |
1714 | { |
1715 | if (!ctb || !ctb->desc) |
1716 | return; |
1717 | |
1718 | saved->raw_status = ctb->desc->status; |
1719 | saved->raw_head = ctb->desc->head; |
1720 | saved->raw_tail = ctb->desc->tail; |
1721 | saved->head = ctb->head; |
1722 | saved->tail = ctb->tail; |
1723 | saved->size = ctb->size; |
1724 | saved->desc_offset = ((void *)ctb->desc) - blob_ptr; |
1725 | saved->cmds_offset = ((void *)ctb->cmds) - blob_ptr; |
1726 | } |
1727 | |
1728 | static struct intel_uc_coredump * |
1729 | gt_record_uc(struct intel_gt_coredump *gt, |
1730 | struct i915_vma_compress *compress) |
1731 | { |
1732 | const struct intel_uc *uc = >->_gt->uc; |
1733 | struct intel_uc_coredump *error_uc; |
1734 | |
	error_uc = kzalloc(sizeof(*error_uc), ALLOW_FAIL);
1736 | if (!error_uc) |
1737 | return NULL; |
1738 | |
1739 | memcpy(&error_uc->guc_fw, &uc->guc.fw, sizeof(uc->guc.fw)); |
1740 | memcpy(&error_uc->huc_fw, &uc->huc.fw, sizeof(uc->huc.fw)); |
1741 | |
	error_uc->guc_fw.file_selected.path = kstrdup(uc->guc.fw.file_selected.path, ALLOW_FAIL);
	error_uc->huc_fw.file_selected.path = kstrdup(uc->huc.fw.file_selected.path, ALLOW_FAIL);
	error_uc->guc_fw.file_wanted.path = kstrdup(uc->guc.fw.file_wanted.path, ALLOW_FAIL);
	error_uc->huc_fw.file_wanted.path = kstrdup(uc->huc.fw.file_wanted.path, ALLOW_FAIL);
1746 | |
1747 | /* |
1748 | * Save the GuC log and include a timestamp reference for converting the |
1749 | * log times to system times (in conjunction with the error->boottime and |
1750 | * gt->clock_frequency fields saved elsewhere). |
1751 | */ |
	error_uc->guc.timestamp = intel_uncore_read(gt->_gt->uncore, GUCPMTIMESTAMP);
	error_uc->guc.vma_log = create_vma_coredump(gt->_gt, uc->guc.log.vma,
						    "GuC log buffer", compress);
	error_uc->guc.vma_ctb = create_vma_coredump(gt->_gt, uc->guc.ct.vma,
						    "GuC CT buffer", compress);
	error_uc->guc.last_fence = uc->guc.ct.requests.last_fence;
	gt_record_guc_ctb(error_uc->guc.ctb + 0, &uc->guc.ct.ctbs.send,
			  uc->guc.ct.ctbs.send.desc, (struct intel_guc *)&uc->guc);
	gt_record_guc_ctb(error_uc->guc.ctb + 1, &uc->guc.ct.ctbs.recv,
			  uc->guc.ct.ctbs.send.desc, (struct intel_guc *)&uc->guc);
1762 | |
1763 | return error_uc; |
1764 | } |
1765 | |
1766 | /* Capture display registers. */ |
1767 | static void gt_record_display_regs(struct intel_gt_coredump *gt) |
1768 | { |
1769 | struct intel_uncore *uncore = gt->_gt->uncore; |
1770 | struct drm_i915_private *i915 = uncore->i915; |
1771 | |
1772 | if (DISPLAY_VER(i915) >= 6 && DISPLAY_VER(i915) < 20) |
1773 | gt->derrmr = intel_uncore_read(uncore, DERRMR); |
1774 | |
1775 | if (GRAPHICS_VER(i915) >= 8) |
1776 | gt->ier = intel_uncore_read(uncore, GEN8_DE_MISC_IER); |
1777 | else if (IS_VALLEYVIEW(i915)) |
1778 | gt->ier = intel_uncore_read(uncore, VLV_IER); |
1779 | else if (HAS_PCH_SPLIT(i915)) |
1780 | gt->ier = intel_uncore_read(uncore, DEIER); |
1781 | else if (GRAPHICS_VER(i915) == 2) |
1782 | gt->ier = intel_uncore_read16(uncore, GEN2_IER); |
1783 | else |
1784 | gt->ier = intel_uncore_read(uncore, GEN2_IER); |
1785 | } |
1786 | |
1787 | /* Capture all other registers that GuC doesn't capture. */ |
1788 | static void gt_record_global_nonguc_regs(struct intel_gt_coredump *gt) |
1789 | { |
1790 | struct intel_uncore *uncore = gt->_gt->uncore; |
1791 | struct drm_i915_private *i915 = uncore->i915; |
1792 | int i; |
1793 | |
1794 | if (IS_VALLEYVIEW(i915)) { |
1795 | gt->gtier[0] = intel_uncore_read(uncore, GTIER); |
1796 | gt->ngtier = 1; |
1797 | } else if (GRAPHICS_VER(i915) >= 11) { |
1798 | gt->gtier[0] = |
1799 | intel_uncore_read(uncore, |
1800 | GEN11_RENDER_COPY_INTR_ENABLE); |
1801 | gt->gtier[1] = |
1802 | intel_uncore_read(uncore, GEN11_VCS_VECS_INTR_ENABLE); |
1803 | gt->gtier[2] = |
1804 | intel_uncore_read(uncore, GEN11_GUC_SG_INTR_ENABLE); |
1805 | gt->gtier[3] = |
1806 | intel_uncore_read(uncore, |
1807 | GEN11_GPM_WGBOXPERF_INTR_ENABLE); |
1808 | gt->gtier[4] = |
1809 | intel_uncore_read(uncore, |
1810 | GEN11_CRYPTO_RSVD_INTR_ENABLE); |
1811 | gt->gtier[5] = |
1812 | intel_uncore_read(uncore, |
1813 | GEN11_GUNIT_CSME_INTR_ENABLE); |
1814 | gt->ngtier = 6; |
1815 | } else if (GRAPHICS_VER(i915) >= 8) { |
1816 | for (i = 0; i < 4; i++) |
1817 | gt->gtier[i] = |
1818 | intel_uncore_read(uncore, GEN8_GT_IER(i)); |
1819 | gt->ngtier = 4; |
1820 | } else if (HAS_PCH_SPLIT(i915)) { |
1821 | gt->gtier[0] = intel_uncore_read(uncore, GTIER); |
1822 | gt->ngtier = 1; |
1823 | } |
1824 | |
1825 | gt->eir = intel_uncore_read(uncore, EIR); |
1826 | gt->pgtbl_er = intel_uncore_read(uncore, PGTBL_ER); |
1827 | } |
1828 | |
1829 | /* |
1830 | * Capture all registers that relate to workload submission. |
1831 | * NOTE: In GuC submission, when GuC resets an engine, it can dump these registers for us. |
1832 | */ |
1833 | static void gt_record_global_regs(struct intel_gt_coredump *gt) |
1834 | { |
1835 | struct intel_uncore *uncore = gt->_gt->uncore; |
1836 | struct drm_i915_private *i915 = uncore->i915; |
1837 | int i; |
1838 | |
1839 | /* |
1840 | * General organization |
1841 | * 1. Registers specific to a single generation |
1842 | * 2. Registers which belong to multiple generations |
1843 | * 3. Feature specific registers. |
1844 | * 4. Everything else |
1845 | * Please try to follow the order. |
1846 | */ |
1847 | |
1848 | /* 1: Registers specific to a single generation */ |
1849 | if (IS_VALLEYVIEW(i915)) |
1850 | gt->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE_VLV); |
1851 | |
1852 | if (GRAPHICS_VER(i915) == 7) |
1853 | gt->err_int = intel_uncore_read(uncore, GEN7_ERR_INT); |
1854 | |
1855 | if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 50)) { |
1856 | gt->fault_data0 = intel_gt_mcr_read_any((struct intel_gt *)gt->_gt, |
1857 | XEHP_FAULT_TLB_DATA0); |
1858 | gt->fault_data1 = intel_gt_mcr_read_any((struct intel_gt *)gt->_gt, |
1859 | XEHP_FAULT_TLB_DATA1); |
1860 | } else if (GRAPHICS_VER(i915) >= 12) { |
1861 | gt->fault_data0 = intel_uncore_read(uncore, |
1862 | GEN12_FAULT_TLB_DATA0); |
1863 | gt->fault_data1 = intel_uncore_read(uncore, |
1864 | GEN12_FAULT_TLB_DATA1); |
1865 | } else if (GRAPHICS_VER(i915) >= 8) { |
1866 | gt->fault_data0 = intel_uncore_read(uncore, |
1867 | GEN8_FAULT_TLB_DATA0); |
1868 | gt->fault_data1 = intel_uncore_read(uncore, |
1869 | GEN8_FAULT_TLB_DATA1); |
1870 | } |
1871 | |
1872 | if (GRAPHICS_VER(i915) == 6) { |
1873 | gt->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE); |
1874 | gt->gab_ctl = intel_uncore_read(uncore, GAB_CTL); |
1875 | gt->gfx_mode = intel_uncore_read(uncore, GFX_MODE); |
1876 | } |
1877 | |
1878 | /* 2: Registers which belong to multiple generations */ |
1879 | if (GRAPHICS_VER(i915) >= 7) |
1880 | gt->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE_MT); |
1881 | |
1882 | if (GRAPHICS_VER(i915) >= 6) { |
1883 | if (GRAPHICS_VER(i915) < 12) { |
1884 | gt->error = intel_uncore_read(uncore, ERROR_GEN6); |
1885 | gt->done_reg = intel_uncore_read(uncore, DONE_REG); |
1886 | } |
1887 | } |
1888 | |
1889 | /* 3: Feature specific registers */ |
1890 | if (IS_GRAPHICS_VER(i915, 6, 7)) { |
1891 | gt->gam_ecochk = intel_uncore_read(uncore, GAM_ECOCHK); |
1892 | gt->gac_eco = intel_uncore_read(uncore, GAC_ECO_BITS); |
1893 | } |
1894 | |
1895 | if (IS_GRAPHICS_VER(i915, 8, 11)) |
1896 | gt->gtt_cache = intel_uncore_read(uncore, HSW_GTT_CACHE_EN); |
1897 | |
1898 | if (GRAPHICS_VER(i915) == 12) |
1899 | gt->aux_err = intel_uncore_read(uncore, GEN12_AUX_ERR_DBG); |
1900 | |
1901 | if (GRAPHICS_VER(i915) >= 12) { |
1902 | for (i = 0; i < I915_MAX_SFC; i++) { |
1903 | /* |
1904 | * SFC_DONE resides in the VD forcewake domain, so it |
1905 | * only exists if the corresponding VCS engine is |
1906 | * present. |
1907 | */ |
1908 | if ((gt->_gt->info.sfc_mask & BIT(i)) == 0 || |
1909 | !HAS_ENGINE(gt->_gt, _VCS(i * 2))) |
1910 | continue; |
1911 | |
1912 | gt->sfc_done[i] = |
1913 | intel_uncore_read(uncore, GEN12_SFC_DONE(i)); |
1914 | } |
1915 | |
1916 | gt->gam_done = intel_uncore_read(uncore, GEN12_GAM_DONE); |
1917 | } |
1918 | } |
1919 | |
1920 | static void gt_record_info(struct intel_gt_coredump *gt) |
1921 | { |
1922 | memcpy(&gt->info, &gt->_gt->info, sizeof(struct intel_gt_info)); |
1923 | gt->clock_frequency = gt->_gt->clock_frequency; |
1924 | gt->clock_period_ns = gt->_gt->clock_period_ns; |
1925 | } |
1926 | |
1927 | /* |
1928 | * Generate a semi-unique error code. The code is not meant to have meaning; its |
1929 | * only purpose is to try to prevent false duplicated bug reports by |
1930 | * grossly estimating a GPU error state. |
1931 | * |
1932 | * TODO: Ideally, hashing the batchbuffer would be a very nice way to determine |
1933 | * the hang if we could strip the GTT offset information from it. |
1934 | * |
1935 | * In its current form, it's only a small step better than a random number. |
1936 | */ |
1937 | static u32 generate_ecode(const struct intel_engine_coredump *ee) |
1938 | { |
1939 | /* |
1940 | * IPEHR would be an ideal way to detect errors, as it's the gross |
1941 | * measure of "the command that hung." However, it can also contain very |
1942 | * common synchronization commands which almost always appear in cases |
1943 | * that are strictly a client bug. Use instdone to differentiate those somewhat. |
1944 | */ |
1945 | return ee ? ee->ipehr ^ ee->instdone.instdone : 0; |
1946 | } |
1947 | |
1948 | static const char *error_msg(struct i915_gpu_coredump *error) |
1949 | { |
1950 | struct intel_engine_coredump *first = NULL; |
1951 | unsigned int hung_classes = 0; |
1952 | struct intel_gt_coredump *gt; |
1953 | int len; |
1954 | |
1955 | for (gt = error->gt; gt; gt = gt->next) { |
1956 | struct intel_engine_coredump *cs; |
1957 | |
1958 | for (cs = gt->engine; cs; cs = cs->next) { |
1959 | if (cs->hung) { |
1960 | hung_classes |= BIT(cs->engine->uabi_class); |
1961 | if (!first) |
1962 | first = cs; |
1963 | } |
1964 | } |
1965 | } |
1966 | |
1967 | len = scnprintf(error->error_msg, sizeof(error->error_msg), |
1968 | "GPU HANG: ecode %d:%x:%08x", |
1969 | GRAPHICS_VER(error->i915), hung_classes, |
1970 | generate_ecode(first)); |
1971 | if (first && first->context.pid) { |
1972 | /* Just show the first executing process; more is confusing */ |
1973 | len += scnprintf(error->error_msg + len, |
1974 | sizeof(error->error_msg) - len, |
1975 | ", in %s [%d]", |
1976 | first->context.comm, first->context.pid); |
1977 | } |
1978 | |
1979 | return error->error_msg; |
1980 | } |
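| |
| /* |
|  * For illustration, a resulting message might read (values invented): |
|  * |
|  *	GPU HANG: ecode 12:1:85dffffb, in glxgears [1234] |
|  * |
|  * i.e. graphics version, mask of hung engine classes, ecode, then the |
|  * first hung process, if known. |
|  */ |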
1981 | |
1982 | static void capture_gen(struct i915_gpu_coredump *error) |
1983 | { |
1984 | struct drm_i915_private *i915 = error->i915; |
1985 | |
1986 | error->wakelock = atomic_read(&i915->runtime_pm.wakeref_count); |
1987 | error->suspended = pm_runtime_suspended(i915->drm.dev); |
1988 | |
1989 | error->iommu = i915_vtd_active(i915); |
1990 | error->reset_count = i915_reset_count(&i915->gpu_error); |
1991 | error->suspend_count = i915->suspend_count; |
1992 | |
1993 | i915_params_copy(&error->params, &i915->params); |
1994 | intel_display_params_copy(&error->display_params); |
1995 | memcpy(&error->device_info, |
1996 | INTEL_INFO(i915), |
1997 | sizeof(error->device_info)); |
1998 | memcpy(&error->runtime_info, |
1999 | RUNTIME_INFO(i915), |
2000 | sizeof(error->runtime_info)); |
2001 | memcpy(&error->display_device_info, DISPLAY_INFO(i915), |
2002 | sizeof(error->display_device_info)); |
2003 | memcpy(&error->display_runtime_info, DISPLAY_RUNTIME_INFO(i915), |
2004 | sizeof(error->display_runtime_info)); |
2005 | error->driver_caps = i915->caps; |
2006 | } |
2007 | |
2008 | struct i915_gpu_coredump * |
2009 | i915_gpu_coredump_alloc(struct drm_i915_private *i915, gfp_t gfp) |
2010 | { |
2011 | struct i915_gpu_coredump *error; |
2012 | |
2013 | if (!i915->params.error_capture) |
2014 | return NULL; |
2015 | |
2016 | error = kzalloc(sizeof(*error), gfp); |
2017 | if (!error) |
2018 | return NULL; |
2019 | |
2020 | kref_init(&error->ref); |
2021 | error->i915 = i915; |
2022 | |
2023 | error->time = ktime_get_real(); |
2024 | error->boottime = ktime_get_boottime(); |
2025 | error->uptime = ktime_sub(ktime_get(), to_gt(i915)->last_init_time); |
2026 | error->capture = jiffies; |
2027 | |
2028 | capture_gen(error); |
2029 | |
2030 | return error; |
2031 | } |
2032 | |
2033 | #define DAY_AS_SECONDS(x) (24 * 60 * 60 * (x)) |
2034 | |
2035 | struct intel_gt_coredump * |
2036 | intel_gt_coredump_alloc(struct intel_gt *gt, gfp_t gfp, u32 dump_flags) |
2037 | { |
2038 | struct intel_gt_coredump *gc; |
2039 | |
2040 | gc = kzalloc(sizeof(*gc), gfp); |
2041 | if (!gc) |
2042 | return NULL; |
2043 | |
2044 | gc->_gt = gt; |
2045 | gc->awake = intel_gt_pm_is_awake(gt); |
2046 | |
2047 | gt_record_display_regs(gc); |
2048 | gt_record_global_nonguc_regs(gc); |
2049 | |
2050 | /* |
2051 | * GuC dumps global, engine-class and engine-instance registers |
2052 | * (those that can change as part of engine state during execution) |
2053 | * before an engine is reset due to a hung context, and it reports |
2054 | * all three groups of registers together as a single set at that |
2055 | * point. |
2056 | * Thus, if GuC triggered the context reset, we retrieve the |
2057 | * register values as part of gt_record_engines instead. |
2058 | */ |
2059 | if (!(dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE)) |
2060 | gt_record_global_regs(gc); |
2061 | |
2062 | gt_record_fences(gc); |
2063 | |
2064 | return gc; |
2065 | } |
2066 | |
2067 | struct i915_vma_compress * |
2068 | i915_vma_capture_prepare(struct intel_gt_coredump *gt) |
2069 | { |
2070 | struct i915_vma_compress *compress; |
2071 | |
2072 | compress = kmalloc(sizeof(*compress), ALLOW_FAIL); |
2073 | if (!compress) |
2074 | return NULL; |
2075 | |
2076 | if (!compress_init(compress)) { |
2077 | kfree(compress); |
2078 | return NULL; |
2079 | } |
2080 | |
2081 | return compress; |
2082 | } |
2083 | |
2084 | void i915_vma_capture_finish(struct intel_gt_coredump *gt, |
2085 | struct i915_vma_compress *compress) |
2086 | { |
2087 | if (!compress) |
2088 | return; |
2089 | |
2090 | compress_fini(compress); |
2091 | kfree(compress); |
2092 | } |
2093 | |
2094 | static struct i915_gpu_coredump * |
2095 | __i915_gpu_coredump(struct intel_gt *gt, intel_engine_mask_t engine_mask, u32 dump_flags) |
2096 | { |
2097 | struct drm_i915_private *i915 = gt->i915; |
2098 | struct i915_gpu_coredump *error; |
2099 | |
2100 | /* Check if GPU capture has been disabled */ |
2101 | error = READ_ONCE(i915->gpu_error.first_error); |
2102 | if (IS_ERR(error)) |
2103 | return error; |
2104 | |
2105 | error = i915_gpu_coredump_alloc(i915, ALLOW_FAIL); |
2106 | if (!error) |
2107 | return ERR_PTR(-ENOMEM); |
2108 | |
2109 | error->gt = intel_gt_coredump_alloc(gt, ALLOW_FAIL, dump_flags); |
2110 | if (error->gt) { |
2111 | struct i915_vma_compress *compress; |
2112 | |
2113 | compress = i915_vma_capture_prepare(error->gt); |
2114 | if (!compress) { |
2115 | kfree(error->gt); |
2116 | kfree(error); |
2117 | return ERR_PTR(-ENOMEM); |
2118 | } |
2119 | |
2120 | if (INTEL_INFO(i915)->has_gt_uc) { |
2121 | error->gt->uc = gt_record_uc(error->gt, compress); |
2122 | if (error->gt->uc) { |
2123 | if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE) |
2124 | error->gt->uc->guc.is_guc_capture = true; |
2125 | else |
2126 | GEM_BUG_ON(error->gt->uc->guc.is_guc_capture); |
2127 | } |
2128 | } |
2129 | |
2130 | gt_record_info(error->gt); |
2131 | gt_record_engines(error->gt, engine_mask, compress, dump_flags); |
2132 | |
2133 | i915_vma_capture_finish(error->gt, compress); |
2135 | |
2136 | error->simulated |= error->gt->simulated; |
2137 | } |
2138 | |
2139 | error->overlay = intel_overlay_capture_error_state(i915); |
2140 | |
2141 | return error; |
2142 | } |
2143 | |
2144 | static struct i915_gpu_coredump * |
2145 | i915_gpu_coredump(struct intel_gt *gt, intel_engine_mask_t engine_mask, u32 dump_flags) |
2146 | { |
2147 | static DEFINE_MUTEX(capture_mutex); |
2148 | int ret = mutex_lock_interruptible(&capture_mutex); |
2149 | struct i915_gpu_coredump *dump; |
2150 | |
2151 | if (ret) |
2152 | return ERR_PTR(ret); |
2153 | |
2154 | dump = __i915_gpu_coredump(gt, engine_mask, dump_flags); |
2155 | mutex_unlock(&capture_mutex); |
2156 | |
2157 | return dump; |
2158 | } |
2159 | |
2160 | void i915_error_state_store(struct i915_gpu_coredump *error) |
2161 | { |
2162 | struct drm_i915_private *i915; |
2163 | static bool warned; |
2164 | |
2165 | if (IS_ERR_OR_NULL(error)) |
2166 | return; |
2167 | |
2168 | i915 = error->i915; |
2169 | drm_info(&i915->drm, "%s\n", error_msg(error)); |
2170 | |
2171 | if (error->simulated || |
2172 | cmpxchg(&i915->gpu_error.first_error, NULL, error)) |
2173 | return; |
2174 | |
2175 | i915_gpu_coredump_get(error); |
2176 | |
2177 | if (!xchg(&warned, true) && |
2178 | ktime_get_real_seconds() - DRIVER_TIMESTAMP < DAY_AS_SECONDS(180)) { |
2179 | pr_info("GPU hangs can indicate a bug anywhere in the entire gfx stack, including userspace.\n" ); |
2180 | pr_info("Please file a _new_ bug report at https://gitlab.freedesktop.org/drm/intel/issues/new.\n" ); |
2181 | pr_info("Please see https://drm.pages.freedesktop.org/intel-docs/how-to-file-i915-bugs.html for details.\n" ); |
2182 | pr_info("drm/i915 developers can then reassign to the right component if it's not a kernel issue.\n" ); |
2183 | pr_info("The GPU crash dump is required to analyze GPU hangs, so please always attach it.\n" ); |
2184 | pr_info("GPU crash dump saved to /sys/class/drm/card%d/error\n" , |
2185 | i915->drm.primary->index); |
2186 | } |
2187 | } |
2188 | |
2189 | /** |
2190 | * i915_capture_error_state - capture an error record for later analysis |
2191 | * @gt: intel_gt which originated the hang |
2192 | * @engine_mask: hung engines |
2193 | * @dump_flags: dump flags |
2194 | * |
2195 | * Should be called when an error is detected (either a hang or an error |
2196 | * interrupt) to capture error state from the time of the error. Fills |
2197 | * out a structure which becomes available in debugfs for user level tools |
2198 | * to pick up. |
2199 | */ |
2200 | void i915_capture_error_state(struct intel_gt *gt, |
2201 | intel_engine_mask_t engine_mask, u32 dump_flags) |
2202 | { |
2203 | struct i915_gpu_coredump *error; |
2204 | |
2205 | error = i915_gpu_coredump(gt, engine_mask, dump_flags); |
2206 | if (IS_ERR(error)) { |
2207 | cmpxchg(&gt->i915->gpu_error.first_error, NULL, error); |
2208 | return; |
2209 | } |
2210 | |
2211 | i915_error_state_store(error); |
2212 | i915_gpu_coredump_put(error); |
2213 | } |
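| |
| /* |
|  * Illustrative call site (a sketch, not taken from this file): a hang |
|  * handler would capture state for the stalled engines before resetting |
|  * them, e.g.: |
|  * |
|  *	i915_capture_error_state(gt, engine->mask, CORE_DUMP_FLAG_NONE); |
|  * |
|  * followed by the actual engine/GT reset. |
|  */ |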
2214 | |
2215 | static struct i915_gpu_coredump * |
2216 | i915_first_error_state(struct drm_i915_private *i915) |
2217 | { |
2218 | struct i915_gpu_coredump *error; |
2219 | |
2220 | spin_lock_irq(&i915->gpu_error.lock); |
2221 | error = i915->gpu_error.first_error; |
2222 | if (!IS_ERR_OR_NULL(error)) |
2223 | i915_gpu_coredump_get(error); |
2224 | spin_unlock_irq(&i915->gpu_error.lock); |
2225 | |
2226 | return error; |
2227 | } |
2228 | |
2229 | void i915_reset_error_state(struct drm_i915_private *i915) |
2230 | { |
2231 | struct i915_gpu_coredump *error; |
2232 | |
2233 | spin_lock_irq(&i915->gpu_error.lock); |
2234 | error = i915->gpu_error.first_error; |
2235 | if (error != ERR_PTR(-ENODEV)) /* if disabled, always disabled */ |
2236 | i915->gpu_error.first_error = NULL; |
2237 | spin_unlock_irq(&i915->gpu_error.lock); |
2238 | |
2239 | if (!IS_ERR_OR_NULL(error)) |
2240 | i915_gpu_coredump_put(error); |
2241 | } |
2242 | |
2243 | void i915_disable_error_state(struct drm_i915_private *i915, int err) |
2244 | { |
2245 | spin_lock_irq(&i915->gpu_error.lock); |
2246 | if (!i915->gpu_error.first_error) |
2247 | i915->gpu_error.first_error = ERR_PTR(err); |
2248 | spin_unlock_irq(lock: &i915->gpu_error.lock); |
2249 | } |
2250 | |
2251 | #if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM) |
2252 | void intel_klog_error_capture(struct intel_gt *gt, |
2253 | intel_engine_mask_t engine_mask) |
2254 | { |
2255 | static int g_count; |
2256 | struct drm_i915_private *i915 = gt->i915; |
2257 | struct i915_gpu_coredump *error; |
2258 | intel_wakeref_t wakeref; |
2259 | size_t buf_size = PAGE_SIZE * 128; |
2260 | size_t pos_err; |
2261 | char *buf, *ptr, *next; |
2262 | int l_count = g_count++; |
2263 | int line = 0; |
2264 | |
2265 | /* Can't allocate memory during a reset */ |
2266 | if (test_bit(I915_RESET_BACKOFF, &gt->reset.flags)) { |
2267 | drm_err(&gt->i915->drm, "[Capture/%d.%d] Inside GT reset, skipping error capture :(\n", |
2268 | l_count, line++); |
2269 | return; |
2270 | } |
2271 | |
2272 | error = READ_ONCE(i915->gpu_error.first_error); |
2273 | if (error) { |
2274 | drm_err(&i915->drm, "[Capture/%d.%d] Clearing existing error capture first...\n", |
2275 | l_count, line++); |
2276 | i915_reset_error_state(i915); |
2277 | } |
2278 | |
2279 | with_intel_runtime_pm(&i915->runtime_pm, wakeref) |
2280 | error = i915_gpu_coredump(gt, engine_mask, CORE_DUMP_FLAG_NONE); |
2281 | |
2282 | if (IS_ERR(error)) { |
2283 | drm_err(&i915->drm, "[Capture/%d.%d] Failed to capture error state: %ld!\n", |
2284 | l_count, line++, PTR_ERR(error)); |
2285 | return; |
2286 | } |
2287 | |
2288 | buf = kvmalloc(buf_size, GFP_KERNEL); |
2289 | if (!buf) { |
2290 | drm_err(&i915->drm, "[Capture/%d.%d] Failed to allocate buffer for error capture!\n", |
2291 | l_count, line++); |
2292 | i915_gpu_coredump_put(error); |
2293 | return; |
2294 | } |
2295 | |
2296 | drm_info(&i915->drm, "[Capture/%d.%d] Dumping i915 error capture for %ps...\n", |
2297 | l_count, line++, __builtin_return_address(0)); |
2298 | |
2299 | /* Largest string length safe to print via dmesg */ |
2300 | # define MAX_CHUNK 800 |
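| /* printk lines top out around 1kB including the prefix, so 800 leaves headroom (approximate figure) */ |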
2301 | |
2302 | pos_err = 0; |
2303 | while (1) { |
2304 | ssize_t got = i915_gpu_coredump_copy_to_buffer(error, buf, pos_err, buf_size - 1); |
2305 | |
2306 | if (got <= 0) |
2307 | break; |
2308 | |
2309 | buf[got] = 0; |
2310 | pos_err += got; |
2311 | |
2312 | ptr = buf; |
2313 | while (got > 0) { |
2314 | size_t count; |
2315 | char tag[2]; |
2316 | |
2317 | next = strnchr(ptr, got, '\n'); |
2318 | if (next) { |
2319 | count = next - ptr; |
2320 | *next = 0; |
2321 | tag[0] = '>'; |
2322 | tag[1] = '<'; |
2323 | } else { |
2324 | count = got; |
2325 | tag[0] = '}'; |
2326 | tag[1] = '{'; |
2327 | } |
2328 | |
2329 | if (count > MAX_CHUNK) { |
2330 | size_t pos; |
2331 | char *ptr2 = ptr; |
2332 | |
2333 | for (pos = MAX_CHUNK; pos < count; pos += MAX_CHUNK) { |
2334 | char chr = ptr[pos]; |
2335 | |
2336 | ptr[pos] = 0; |
2337 | drm_info(&i915->drm, "[Capture/%d.%d] }%s{\n", |
2338 | l_count, line++, ptr2); |
2339 | ptr[pos] = chr; |
2340 | ptr2 = ptr + pos; |
2341 | |
2342 | /* |
2343 | * If spewing large amounts of data via a serial console, |
2344 | * this can be a very slow process. So be friendly and try |
2345 | * not to cause 'softlockup on CPU' problems. |
2346 | */ |
2347 | cond_resched(); |
2348 | } |
2349 | |
2350 | if (ptr2 < (ptr + count)) |
2351 | drm_info(&i915->drm, "[Capture/%d.%d] %c%s%c\n", |
2352 | l_count, line++, tag[0], ptr2, tag[1]); |
2353 | else if (tag[0] == '>') |
2354 | drm_info(&i915->drm, "[Capture/%d.%d] ><\n", |
2355 | l_count, line++); |
2356 | } else { |
2357 | drm_info(&i915->drm, "[Capture/%d.%d] %c%s%c\n", |
2358 | l_count, line++, tag[0], ptr, tag[1]); |
2359 | } |
2360 | |
2361 | ptr = next; |
2362 | got -= count; |
2363 | if (next) { |
2364 | ptr++; |
2365 | got--; |
2366 | } |
2367 | |
2368 | /* As above. */ |
2369 | cond_resched(); |
2370 | } |
2371 | |
2372 | if (got) |
2373 | drm_info(&i915->drm, "[Capture/%d.%d] Got %zd bytes remaining!\n", |
2374 | l_count, line++, got); |
2375 | } |
2376 | |
2377 | kvfree(buf); |
2378 | |
2379 | drm_info(&i915->drm, "[Capture/%d.%d] Dumped %zd bytes\n", l_count, line++, pos_err); |
2380 | } |
2381 | #endif |
2382 | |
2383 | static ssize_t gpu_state_read(struct file *file, char __user *ubuf, |
2384 | size_t count, loff_t *pos) |
2385 | { |
2386 | struct i915_gpu_coredump *error; |
2387 | ssize_t ret; |
2388 | void *buf; |
2389 | |
2390 | error = file->private_data; |
2391 | if (!error) |
2392 | return 0; |
2393 | |
2394 | /* Bounce buffer required for the convenience of the kernfs __user API. */ |
2395 | buf = kmalloc(count, GFP_KERNEL); |
2396 | if (!buf) |
2397 | return -ENOMEM; |
2398 | |
2399 | ret = i915_gpu_coredump_copy_to_buffer(error, buf, *pos, count); |
2400 | if (ret <= 0) |
2401 | goto out; |
2402 | |
2403 | if (!copy_to_user(ubuf, buf, ret)) |
2404 | *pos += ret; |
2405 | else |
2406 | ret = -EFAULT; |
2407 | |
2408 | out: |
2409 | kfree(buf); |
2410 | return ret; |
2411 | } |
2412 | |
2413 | static int gpu_state_release(struct inode *inode, struct file *file) |
2414 | { |
2415 | i915_gpu_coredump_put(file->private_data); |
2416 | return 0; |
2417 | } |
2418 | |
2419 | static int i915_gpu_info_open(struct inode *inode, struct file *file) |
2420 | { |
2421 | struct drm_i915_private *i915 = inode->i_private; |
2422 | struct i915_gpu_coredump *gpu; |
2423 | intel_wakeref_t wakeref; |
2424 | |
2425 | gpu = NULL; |
2426 | with_intel_runtime_pm(&i915->runtime_pm, wakeref) |
2427 | gpu = i915_gpu_coredump(to_gt(i915), ALL_ENGINES, CORE_DUMP_FLAG_NONE); |
2428 | |
2429 | if (IS_ERR(gpu)) |
2430 | return PTR_ERR(gpu); |
2431 | |
2432 | file->private_data = gpu; |
2433 | return 0; |
2434 | } |
2435 | |
2436 | static const struct file_operations i915_gpu_info_fops = { |
2437 | .owner = THIS_MODULE, |
2438 | .open = i915_gpu_info_open, |
2439 | .read = gpu_state_read, |
2440 | .llseek = default_llseek, |
2441 | .release = gpu_state_release, |
2442 | }; |
2443 | |
2444 | static ssize_t |
2445 | i915_error_state_write(struct file *filp, |
2446 | const char __user *ubuf, |
2447 | size_t cnt, |
2448 | loff_t *ppos) |
2449 | { |
2450 | struct i915_gpu_coredump *error = filp->private_data; |
2451 | |
2452 | if (!error) |
2453 | return 0; |
2454 | |
2455 | drm_dbg(&error->i915->drm, "Resetting error state\n"); |
2456 | i915_reset_error_state(error->i915); |
2457 | |
2458 | return cnt; |
2459 | } |
2460 | |
2461 | static int i915_error_state_open(struct inode *inode, struct file *file) |
2462 | { |
2463 | struct i915_gpu_coredump *error; |
2464 | |
2465 | error = i915_first_error_state(inode->i_private); |
2466 | if (IS_ERR(error)) |
2467 | return PTR_ERR(error); |
2468 | |
2469 | file->private_data = error; |
2470 | return 0; |
2471 | } |
2472 | |
2473 | static const struct file_operations i915_error_state_fops = { |
2474 | .owner = THIS_MODULE, |
2475 | .open = i915_error_state_open, |
2476 | .read = gpu_state_read, |
2477 | .write = i915_error_state_write, |
2478 | .llseek = default_llseek, |
2479 | .release = gpu_state_release, |
2480 | }; |
2481 | |
2482 | void i915_gpu_error_debugfs_register(struct drm_i915_private *i915) |
2483 | { |
2484 | struct drm_minor *minor = i915->drm.primary; |
2485 | |
2486 | debugfs_create_file("i915_error_state", 0644, minor->debugfs_root, i915, |
2487 | &i915_error_state_fops); |
2488 | debugfs_create_file("i915_gpu_info", 0644, minor->debugfs_root, i915, |
2489 | &i915_gpu_info_fops); |
2490 | } |
2491 | |
2492 | static ssize_t error_state_read(struct file *filp, struct kobject *kobj, |
2493 | struct bin_attribute *attr, char *buf, |
2494 | loff_t off, size_t count) |
2495 | { |
2497 | struct device *kdev = kobj_to_dev(kobj); |
2498 | struct drm_i915_private *i915 = kdev_minor_to_i915(kdev); |
2499 | struct i915_gpu_coredump *gpu; |
2500 | ssize_t ret = 0; |
2501 | |
2502 | /* |
2503 | * FIXME: Concurrent clients triggering resets and reading + clearing |
2504 | * dumps can cause inconsistent sysfs reads when a user calls in with a |
2505 | * non-zero offset to complete a prior partial read but the |
2506 | * gpu_coredump has been cleared or replaced. |
2507 | */ |
2508 | |
2509 | gpu = i915_first_error_state(i915); |
2510 | if (IS_ERR(gpu)) { |
2511 | ret = PTR_ERR(gpu); |
2512 | } else if (gpu) { |
2513 | ret = i915_gpu_coredump_copy_to_buffer(gpu, buf, off, count); |
2514 | i915_gpu_coredump_put(gpu); |
2515 | } else { |
2516 | const char *str = "No error state collected\n"; |
2517 | size_t len = strlen(str); |
2518 | |
2519 | if (off < len) { |
2520 | ret = min_t(size_t, count, len - off); |
2521 | memcpy(buf, str + off, ret); |
2522 | } |
2523 | } |
2524 | |
2525 | return ret; |
2526 | } |
2527 | |
2528 | static ssize_t error_state_write(struct file *file, struct kobject *kobj, |
2529 | struct bin_attribute *attr, char *buf, |
2530 | loff_t off, size_t count) |
2531 | { |
2532 | struct device *kdev = kobj_to_dev(kobj); |
2533 | struct drm_i915_private *dev_priv = kdev_minor_to_i915(kdev); |
2534 | |
2535 | drm_dbg(&dev_priv->drm, "Resetting error state\n"); |
2536 | i915_reset_error_state(dev_priv); |
2537 | |
2538 | return count; |
2539 | } |
2540 | |
2541 | static const struct bin_attribute error_state_attr = { |
2542 | .attr.name = "error", |
2543 | .attr.mode = S_IRUSR | S_IWUSR, |
2544 | .size = 0, |
2545 | .read = error_state_read, |
2546 | .write = error_state_write, |
2547 | }; |
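| |
| /* |
|  * Illustrative userspace usage (not part of this file): the dump is read |
|  * and then cleared through the sysfs node registered below, e.g.: |
|  * |
|  *	cat /sys/class/drm/card0/error > gpu-hang.log |
|  *	echo 1 > /sys/class/drm/card0/error |
|  * |
|  * card0 is an example; writing anything clears the stored error state. |
|  */ |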
2548 | |
2549 | void i915_gpu_error_sysfs_setup(struct drm_i915_private *i915) |
2550 | { |
2551 | struct device *kdev = i915->drm.primary->kdev; |
2552 | |
2553 | if (sysfs_create_bin_file(&kdev->kobj, &error_state_attr)) |
2554 | drm_err(&i915->drm, "error_state sysfs setup failed\n"); |
2555 | } |
2556 | |
2557 | void i915_gpu_error_sysfs_teardown(struct drm_i915_private *i915) |
2558 | { |
2559 | struct device *kdev = i915->drm.primary->kdev; |
2560 | |
2561 | sysfs_remove_bin_file(&kdev->kobj, &error_state_attr); |
2562 | } |
2563 | |