#include <linux/ascii85.h>
#include <linux/nmi.h>
#include <linux/scatterlist.h>
#include <linux/stop_machine.h>
#include <linux/utsname.h>
#include <linux/zlib.h>

#include <drm/drm_print.h>

#include "display/intel_atomic.h"
#include "display/intel_overlay.h"

#include "gem/i915_gem_context.h"

#include "i915_drv.h"
#include "i915_gpu_error.h"
#include "i915_scatterlist.h"
#include "intel_csr.h"

static inline const struct intel_engine_cs *
engine_lookup(const struct drm_i915_private *i915, unsigned int id)
{
	if (id >= I915_NUM_ENGINES)
		return NULL;

	return i915->engine[id];
}

static inline const char *
__engine_name(const struct intel_engine_cs *engine)
{
	return engine ? engine->name : "";
}

static const char *
engine_name(const struct drm_i915_private *i915, unsigned int id)
{
	return __engine_name(engine_lookup(i915, id));
}

static const char *tiling_flag(int tiling)
{
	switch (tiling) {
	default:
	case I915_TILING_NONE: return "";
	case I915_TILING_X: return " X";
	case I915_TILING_Y: return " Y";
	}
}

static const char *dirty_flag(int dirty)
{
	return dirty ? " dirty" : "";
}

static const char *purgeable_flag(int purgeable)
{
	return purgeable ? " purgeable" : "";
}

static void __sg_set_buf(struct scatterlist *sg,
			 void *addr, unsigned int len, loff_t it)
{
	sg->page_link = (unsigned long)virt_to_page(addr);
	sg->offset = offset_in_page(addr);
	sg->length = len;
	sg->dma_address = it;
}

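/*
 * The error state is built up in kmalloc'ed chunks that are tracked in a
 * scatterlist chain, so that an arbitrarily large capture can be assembled
 * and later streamed out to userspace in pieces (see
 * i915_gpu_state_copy_to_buffer()). sg->dma_address is reused here to record
 * the logical file offset of each chunk.
 */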
static bool __i915_error_grow(struct drm_i915_error_state_buf *e, size_t len)
{
	if (!len)
		return false;

	if (e->bytes + len + 1 <= e->size)
		return true;

	if (e->bytes) {
		__sg_set_buf(e->cur++, e->buf, e->bytes, e->iter);
		e->iter += e->bytes;
		e->buf = NULL;
		e->bytes = 0;
	}

	if (e->cur == e->end) {
		struct scatterlist *sgl;

		sgl = (typeof(sgl))__get_free_page(GFP_KERNEL);
		if (!sgl) {
			e->err = -ENOMEM;
			return false;
		}

		if (e->cur) {
			e->cur->offset = 0;
			e->cur->length = 0;
			e->cur->page_link =
				(unsigned long)sgl | SG_CHAIN;
		} else {
			e->sgl = sgl;
		}

		e->cur = sgl;
		e->end = sgl + SG_MAX_SINGLE_ALLOC - 1;
	}

	e->size = ALIGN(len + 1, SZ_64K);
	e->buf = kmalloc(e->size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
	if (!e->buf) {
		e->size = PAGE_ALIGN(len + 1);
		e->buf = kmalloc(e->size, GFP_KERNEL);
	}
	if (!e->buf) {
		e->err = -ENOMEM;
		return false;
	}

	return true;
}

__printf(2, 0)
static void i915_error_vprintf(struct drm_i915_error_state_buf *e,
			       const char *fmt, va_list args)
{
	va_list ap;
	int len;

	if (e->err)
		return;

	va_copy(ap, args);
	len = vsnprintf(NULL, 0, fmt, ap);
	va_end(ap);
	if (len <= 0) {
		e->err = len;
		return;
	}

	if (!__i915_error_grow(e, len))
		return;

	GEM_BUG_ON(e->bytes >= e->size);
	len = vscnprintf(e->buf + e->bytes, e->size - e->bytes, fmt, args);
	if (len < 0) {
		e->err = len;
		return;
	}
	e->bytes += len;
}

static void i915_error_puts(struct drm_i915_error_state_buf *e, const char *str)
{
	unsigned len;

	if (e->err || !str)
		return;

	len = strlen(str);
	if (!__i915_error_grow(e, len))
		return;

	GEM_BUG_ON(e->bytes + len > e->size);
	memcpy(e->buf + e->bytes, str, len);
	e->bytes += len;
}

#define err_printf(e, ...) i915_error_printf(e, __VA_ARGS__)
#define err_puts(e, s) i915_error_puts(e, s)

static void __i915_printfn_error(struct drm_printer *p, struct va_format *vaf)
{
	i915_error_vprintf(p->arg, vaf->fmt, *vaf->va);
}

static inline struct drm_printer
i915_error_printer(struct drm_i915_error_state_buf *e)
{
	struct drm_printer p = {
		.printfn = __i915_printfn_error,
		.arg = e,
	};
	return p;
}

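/*
 * When CONFIG_DRM_I915_COMPRESS_ERROR is enabled, the captured object pages
 * are deflated with zlib before being ascii85-encoded into the error state;
 * otherwise they are copied verbatim. The single-character marker emitted by
 * err_compression_marker() (':' for zlib, '~' for uncompressed) tells the
 * decoder which format follows.
 */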
#ifdef CONFIG_DRM_I915_COMPRESS_ERROR

struct compress {
	struct z_stream_s zstream;
	void *tmp;
};

static bool compress_init(struct compress *c)
{
	struct z_stream_s *zstream = memset(&c->zstream, 0, sizeof(c->zstream));

	zstream->workspace =
		kmalloc(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
			GFP_ATOMIC | __GFP_NOWARN);
	if (!zstream->workspace)
		return false;

	if (zlib_deflateInit(zstream, Z_DEFAULT_COMPRESSION) != Z_OK) {
		kfree(zstream->workspace);
		return false;
	}

	c->tmp = NULL;
	if (i915_has_memcpy_from_wc())
		c->tmp = (void *)__get_free_page(GFP_ATOMIC | __GFP_NOWARN);

	return true;
}

static void *compress_next_page(struct drm_i915_error_object *dst)
{
	unsigned long page;

	if (dst->page_count >= dst->num_pages)
		return ERR_PTR(-ENOSPC);

	page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN);
	if (!page)
		return ERR_PTR(-ENOMEM);

	return dst->pages[dst->page_count++] = (void *)page;
}

static int compress_page(struct compress *c,
			 void *src,
			 struct drm_i915_error_object *dst)
{
	struct z_stream_s *zstream = &c->zstream;

	zstream->next_in = src;
	if (c->tmp && i915_memcpy_from_wc(c->tmp, src, PAGE_SIZE))
		zstream->next_in = c->tmp;
	zstream->avail_in = PAGE_SIZE;

	do {
		if (zstream->avail_out == 0) {
			zstream->next_out = compress_next_page(dst);
			if (IS_ERR(zstream->next_out))
				return PTR_ERR(zstream->next_out);

			zstream->avail_out = PAGE_SIZE;
		}

		if (zlib_deflate(zstream, Z_NO_FLUSH) != Z_OK)
			return -EIO;

		touch_nmi_watchdog();
	} while (zstream->avail_in);

	if (0 && zstream->total_out > zstream->total_in)
		return -E2BIG;

	return 0;
}

static int compress_flush(struct compress *c,
			  struct drm_i915_error_object *dst)
{
	struct z_stream_s *zstream = &c->zstream;

	do {
		switch (zlib_deflate(zstream, Z_FINISH)) {
		case Z_OK:
			zstream->next_out = compress_next_page(dst);
			if (IS_ERR(zstream->next_out))
				return PTR_ERR(zstream->next_out);

			zstream->avail_out = PAGE_SIZE;
			break;

		case Z_STREAM_END:
			goto end;

		default:
			return -EIO;
		}
	} while (1);

end:
	memset(zstream->next_out, 0, zstream->avail_out);
	dst->unused = zstream->avail_out;
	return 0;
}

static void compress_fini(struct compress *c,
			  struct drm_i915_error_object *dst)
{
	struct z_stream_s *zstream = &c->zstream;

	zlib_deflateEnd(zstream);
	kfree(zstream->workspace);
	if (c->tmp)
		free_page((unsigned long)c->tmp);
}

static void err_compression_marker(struct drm_i915_error_state_buf *m)
{
	err_puts(m, ":");
}

#else

struct compress {
};

static bool compress_init(struct compress *c)
{
	return true;
}

static int compress_page(struct compress *c,
			 void *src,
			 struct drm_i915_error_object *dst)
{
	unsigned long page;
	void *ptr;

	page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN);
	if (!page)
		return -ENOMEM;

	ptr = (void *)page;
	if (!i915_memcpy_from_wc(ptr, src, PAGE_SIZE))
		memcpy(ptr, src, PAGE_SIZE);
	dst->pages[dst->page_count++] = ptr;

	return 0;
}

static int compress_flush(struct compress *c,
			  struct drm_i915_error_object *dst)
{
	return 0;
}

static void compress_fini(struct compress *c,
			  struct drm_i915_error_object *dst)
{
}

static void err_compression_marker(struct drm_i915_error_state_buf *m)
{
	err_puts(m, "~");
}

#endif

static void print_error_buffers(struct drm_i915_error_state_buf *m,
				const char *name,
				struct drm_i915_error_buffer *err,
				int count)
{
	err_printf(m, "%s [%d]:\n", name, count);

	while (count--) {
		err_printf(m, " %08x_%08x %8u %02x %02x",
			   upper_32_bits(err->gtt_offset),
			   lower_32_bits(err->gtt_offset),
			   err->size,
			   err->read_domains,
			   err->write_domain);
		err_puts(m, tiling_flag(err->tiling));
		err_puts(m, dirty_flag(err->dirty));
		err_puts(m, purgeable_flag(err->purgeable));
		err_puts(m, err->userptr ? " userptr" : "");
		err_puts(m, i915_cache_level_str(m->i915, err->cache_level));

		if (err->name)
			err_printf(m, " (name: %d)", err->name);
		if (err->fence_reg != I915_FENCE_REG_NONE)
			err_printf(m, " (fence: %d)", err->fence_reg);

		err_puts(m, "\n");
		err++;
	}
}

static void error_print_instdone(struct drm_i915_error_state_buf *m,
				 const struct drm_i915_error_engine *ee)
{
	int slice;
	int subslice;

	err_printf(m, " INSTDONE: 0x%08x\n",
		   ee->instdone.instdone);

	if (ee->engine_id != RCS0 || INTEL_GEN(m->i915) <= 3)
		return;

	err_printf(m, " SC_INSTDONE: 0x%08x\n",
		   ee->instdone.slice_common);

	if (INTEL_GEN(m->i915) <= 6)
		return;

	for_each_instdone_slice_subslice(m->i915, slice, subslice)
		err_printf(m, " SAMPLER_INSTDONE[%d][%d]: 0x%08x\n",
			   slice, subslice,
			   ee->instdone.sampler[slice][subslice]);

	for_each_instdone_slice_subslice(m->i915, slice, subslice)
		err_printf(m, " ROW_INSTDONE[%d][%d]: 0x%08x\n",
			   slice, subslice,
			   ee->instdone.row[slice][subslice]);
}

static void error_print_request(struct drm_i915_error_state_buf *m,
				const char *prefix,
				const struct drm_i915_error_request *erq,
				const unsigned long epoch)
{
	if (!erq->seqno)
		return;

	err_printf(m, "%s pid %d, seqno %8x:%08x%s%s, prio %d, emitted %dms, start %08x, head %08x, tail %08x\n",
		   prefix, erq->pid, erq->context, erq->seqno,
		   test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
			    &erq->flags) ? "!" : "",
		   test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
			    &erq->flags) ? "+" : "",
		   erq->sched_attr.priority,
		   jiffies_to_msecs(erq->jiffies - epoch),
		   erq->start, erq->head, erq->tail);
}

static void error_print_context(struct drm_i915_error_state_buf *m,
				const char *header,
				const struct drm_i915_error_context *ctx)
{
	err_printf(m, "%s%s[%d] hw_id %d, prio %d, guilty %d active %d\n",
		   header, ctx->comm, ctx->pid, ctx->hw_id,
		   ctx->sched_attr.priority, ctx->guilty, ctx->active);
}

static void error_print_engine(struct drm_i915_error_state_buf *m,
			       const struct drm_i915_error_engine *ee,
			       const unsigned long epoch)
{
	int n;

	err_printf(m, "%s command stream:\n",
		   engine_name(m->i915, ee->engine_id));
	err_printf(m, " IDLE?: %s\n", yesno(ee->idle));
	err_printf(m, " START: 0x%08x\n", ee->start);
	err_printf(m, " HEAD: 0x%08x [0x%08x]\n", ee->head, ee->rq_head);
	err_printf(m, " TAIL: 0x%08x [0x%08x, 0x%08x]\n",
		   ee->tail, ee->rq_post, ee->rq_tail);
	err_printf(m, " CTL: 0x%08x\n", ee->ctl);
	err_printf(m, " MODE: 0x%08x\n", ee->mode);
	err_printf(m, " HWS: 0x%08x\n", ee->hws);
	err_printf(m, " ACTHD: 0x%08x %08x\n",
		   (u32)(ee->acthd >> 32), (u32)ee->acthd);
	err_printf(m, " IPEIR: 0x%08x\n", ee->ipeir);
	err_printf(m, " IPEHR: 0x%08x\n", ee->ipehr);

	error_print_instdone(m, ee);

	if (ee->batchbuffer) {
		u64 start = ee->batchbuffer->gtt_offset;
		u64 end = start + ee->batchbuffer->gtt_size;

		err_printf(m, " batch: [0x%08x_%08x, 0x%08x_%08x]\n",
			   upper_32_bits(start), lower_32_bits(start),
			   upper_32_bits(end), lower_32_bits(end));
	}
	if (INTEL_GEN(m->i915) >= 4) {
		err_printf(m, " BBADDR: 0x%08x_%08x\n",
			   (u32)(ee->bbaddr >> 32), (u32)ee->bbaddr);
		err_printf(m, " BB_STATE: 0x%08x\n", ee->bbstate);
		err_printf(m, " INSTPS: 0x%08x\n", ee->instps);
	}
	err_printf(m, " INSTPM: 0x%08x\n", ee->instpm);
	err_printf(m, " FADDR: 0x%08x %08x\n", upper_32_bits(ee->faddr),
		   lower_32_bits(ee->faddr));
	if (INTEL_GEN(m->i915) >= 6) {
		err_printf(m, " RC PSMI: 0x%08x\n", ee->rc_psmi);
		err_printf(m, " FAULT_REG: 0x%08x\n", ee->fault_reg);
	}
	if (HAS_PPGTT(m->i915)) {
		err_printf(m, " GFX_MODE: 0x%08x\n", ee->vm_info.gfx_mode);

		if (INTEL_GEN(m->i915) >= 8) {
			int i;
			for (i = 0; i < 4; i++)
				err_printf(m, " PDP%d: 0x%016llx\n",
					   i, ee->vm_info.pdp[i]);
		} else {
			err_printf(m, " PP_DIR_BASE: 0x%08x\n",
				   ee->vm_info.pp_dir_base);
		}
	}
	err_printf(m, " ring->head: 0x%08x\n", ee->cpu_ring_head);
	err_printf(m, " ring->tail: 0x%08x\n", ee->cpu_ring_tail);
	err_printf(m, " hangcheck timestamp: %dms (%lu%s)\n",
		   jiffies_to_msecs(ee->hangcheck_timestamp - epoch),
		   ee->hangcheck_timestamp,
		   ee->hangcheck_timestamp == epoch ? "; epoch" : "");
	err_printf(m, " engine reset count: %u\n", ee->reset_count);

	for (n = 0; n < ee->num_ports; n++) {
		err_printf(m, " ELSP[%d]:", n);
		error_print_request(m, " ", &ee->execlist[n], epoch);
	}

	error_print_context(m, " Active context: ", &ee->context);
}

void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...)
{
	va_list args;

	va_start(args, f);
	i915_error_vprintf(e, f, args);
	va_end(args);
}

static void print_error_obj(struct drm_i915_error_state_buf *m,
			    struct intel_engine_cs *engine,
			    const char *name,
			    struct drm_i915_error_object *obj)
{
	char out[ASCII85_BUFSZ];
	int page;

	if (!obj)
		return;

	if (name) {
		err_printf(m, "%s --- %s = 0x%08x %08x\n",
			   engine ? engine->name : "global", name,
			   upper_32_bits(obj->gtt_offset),
			   lower_32_bits(obj->gtt_offset));
	}

	err_compression_marker(m);
	for (page = 0; page < obj->page_count; page++) {
		int i, len;

		len = PAGE_SIZE;
		if (page == obj->page_count - 1)
			len -= obj->unused;
		len = ascii85_encode_len(len);

		for (i = 0; i < len; i++)
			err_puts(m, ascii85_encode(obj->pages[page][i], out));
	}
	err_puts(m, "\n");
}

static void err_print_capabilities(struct drm_i915_error_state_buf *m,
				   const struct intel_device_info *info,
				   const struct intel_runtime_info *runtime,
				   const struct intel_driver_caps *caps)
{
	struct drm_printer p = i915_error_printer(m);

	intel_device_info_dump_flags(info, &p);
	intel_driver_caps_print(caps, &p);
	intel_device_info_dump_topology(&runtime->sseu, &p);
}

static void err_print_params(struct drm_i915_error_state_buf *m,
			     const struct i915_params *params)
{
	struct drm_printer p = i915_error_printer(m);

	i915_params_dump(params, &p);
}

static void err_print_pciid(struct drm_i915_error_state_buf *m,
			    struct drm_i915_private *i915)
{
	struct pci_dev *pdev = i915->drm.pdev;

	err_printf(m, "PCI ID: 0x%04x\n", pdev->device);
	err_printf(m, "PCI Revision: 0x%02x\n", pdev->revision);
	err_printf(m, "PCI Subsystem: %04x:%04x\n",
		   pdev->subsystem_vendor,
		   pdev->subsystem_device);
}

static void err_print_uc(struct drm_i915_error_state_buf *m,
			 const struct i915_error_uc *error_uc)
{
	struct drm_printer p = i915_error_printer(m);
	const struct i915_gpu_state *error =
		container_of(error_uc, typeof(*error), uc);

	if (!error->device_info.has_guc)
		return;

	intel_uc_fw_dump(&error_uc->guc_fw, &p);
	intel_uc_fw_dump(&error_uc->huc_fw, &p);
	print_error_obj(m, NULL, "GuC log buffer", error_uc->guc_log);
}

static void err_free_sgl(struct scatterlist *sgl)
{
	while (sgl) {
		struct scatterlist *sg;

		for (sg = sgl; !sg_is_chain(sg); sg++) {
			kfree(sg_virt(sg));
			if (sg_is_last(sg))
				break;
		}

		sg = sg_is_last(sg) ? NULL : sg_chain_ptr(sg);
		free_page((unsigned long)sgl);
		sgl = sg;
	}
}

static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
			       struct i915_gpu_state *error)
{
	struct drm_i915_error_object *obj;
	struct timespec64 ts;
	int i, j;

	if (*error->error_msg)
		err_printf(m, "%s\n", error->error_msg);
	err_printf(m, "Kernel: %s %s\n",
		   init_utsname()->release,
		   init_utsname()->machine);
	ts = ktime_to_timespec64(error->time);
	err_printf(m, "Time: %lld s %ld us\n",
		   (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
	ts = ktime_to_timespec64(error->boottime);
	err_printf(m, "Boottime: %lld s %ld us\n",
		   (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
	ts = ktime_to_timespec64(error->uptime);
	err_printf(m, "Uptime: %lld s %ld us\n",
		   (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
	err_printf(m, "Epoch: %lu jiffies (%u HZ)\n", error->epoch, HZ);
	err_printf(m, "Capture: %lu jiffies; %d ms ago, %d ms after epoch\n",
		   error->capture,
		   jiffies_to_msecs(jiffies - error->capture),
		   jiffies_to_msecs(error->capture - error->epoch));

	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
		if (!error->engine[i].context.pid)
			continue;

		err_printf(m, "Active process (on ring %s): %s [%d]\n",
			   engine_name(m->i915, i),
			   error->engine[i].context.comm,
			   error->engine[i].context.pid);
	}
	err_printf(m, "Reset count: %u\n", error->reset_count);
	err_printf(m, "Suspend count: %u\n", error->suspend_count);
	err_printf(m, "Platform: %s\n", intel_platform_name(error->device_info.platform));
	err_printf(m, "Subplatform: 0x%x\n",
		   intel_subplatform(&error->runtime_info,
				     error->device_info.platform));
	err_print_pciid(m, m->i915);

	err_printf(m, "IOMMU enabled?: %d\n", error->iommu);

	if (HAS_CSR(m->i915)) {
		struct intel_csr *csr = &m->i915->csr;

		err_printf(m, "DMC loaded: %s\n",
			   yesno(csr->dmc_payload != NULL));
		err_printf(m, "DMC fw version: %d.%d\n",
			   CSR_VERSION_MAJOR(csr->version),
			   CSR_VERSION_MINOR(csr->version));
	}

	err_printf(m, "GT awake: %s\n", yesno(error->awake));
	err_printf(m, "RPM wakelock: %s\n", yesno(error->wakelock));
	err_printf(m, "PM suspended: %s\n", yesno(error->suspended));
	err_printf(m, "EIR: 0x%08x\n", error->eir);
	err_printf(m, "IER: 0x%08x\n", error->ier);
	for (i = 0; i < error->ngtier; i++)
		err_printf(m, "GTIER[%d]: 0x%08x\n", i, error->gtier[i]);
	err_printf(m, "PGTBL_ER: 0x%08x\n", error->pgtbl_er);
	err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake);
	err_printf(m, "DERRMR: 0x%08x\n", error->derrmr);
	err_printf(m, "CCID: 0x%08x\n", error->ccid);

	for (i = 0; i < error->nfence; i++)
		err_printf(m, " fence[%d] = %08llx\n", i, error->fence[i]);

	if (INTEL_GEN(m->i915) >= 6) {
		err_printf(m, "ERROR: 0x%08x\n", error->error);

		if (INTEL_GEN(m->i915) >= 8)
			err_printf(m, "FAULT_TLB_DATA: 0x%08x 0x%08x\n",
				   error->fault_data1, error->fault_data0);

		err_printf(m, "DONE_REG: 0x%08x\n", error->done_reg);
	}

	if (IS_GEN(m->i915, 7))
		err_printf(m, "ERR_INT: 0x%08x\n", error->err_int);

	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
		if (error->engine[i].engine_id != -1)
			error_print_engine(m, &error->engine[i], error->epoch);
	}

	for (i = 0; i < ARRAY_SIZE(error->active_vm); i++) {
		char buf[128];
		int len, first = 1;

		if (!error->active_vm[i])
			break;

		len = scnprintf(buf, sizeof(buf), "Active (");
		for (j = 0; j < ARRAY_SIZE(error->engine); j++) {
			if (error->engine[j].vm != error->active_vm[i])
				continue;

			len += scnprintf(buf + len, sizeof(buf) - len, "%s%s",
					 first ? "" : ", ",
					 m->i915->engine[j]->name);
			first = 0;
		}
		scnprintf(buf + len, sizeof(buf) - len, ")");
		print_error_buffers(m, buf,
				    error->active_bo[i],
				    error->active_bo_count[i]);
	}

	print_error_buffers(m, "Pinned (global)",
			    error->pinned_bo,
			    error->pinned_bo_count);

	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
		const struct drm_i915_error_engine *ee = &error->engine[i];

		obj = ee->batchbuffer;
		if (obj) {
			err_puts(m, m->i915->engine[i]->name);
			if (ee->context.pid)
				err_printf(m, " (submitted by %s [%d])",
					   ee->context.comm,
					   ee->context.pid);
			err_printf(m, " --- gtt_offset = 0x%08x %08x\n",
				   upper_32_bits(obj->gtt_offset),
				   lower_32_bits(obj->gtt_offset));
			print_error_obj(m, m->i915->engine[i], NULL, obj);
		}

		for (j = 0; j < ee->user_bo_count; j++)
			print_error_obj(m, m->i915->engine[i],
					"user", ee->user_bo[j]);

		if (ee->num_requests) {
			err_printf(m, "%s --- %d requests\n",
				   m->i915->engine[i]->name,
				   ee->num_requests);
			for (j = 0; j < ee->num_requests; j++)
				error_print_request(m, " ",
						    &ee->requests[j],
						    error->epoch);
		}

		print_error_obj(m, m->i915->engine[i],
				"ringbuffer", ee->ringbuffer);

		print_error_obj(m, m->i915->engine[i],
				"HW Status", ee->hws_page);

		print_error_obj(m, m->i915->engine[i],
				"HW context", ee->ctx);

		print_error_obj(m, m->i915->engine[i],
				"WA context", ee->wa_ctx);

		print_error_obj(m, m->i915->engine[i],
				"WA batchbuffer", ee->wa_batchbuffer);

		print_error_obj(m, m->i915->engine[i],
				"NULL context", ee->default_state);
	}

	if (error->overlay)
		intel_overlay_print_error_state(m, error->overlay);

	if (error->display)
		intel_display_print_error_state(m, error->display);

	err_print_capabilities(m, &error->device_info, &error->runtime_info,
			       &error->driver_caps);
	err_print_params(m, &error->params);
	err_print_uc(m, &error->uc);
}

static int err_print_to_sgl(struct i915_gpu_state *error)
{
	struct drm_i915_error_state_buf m;

	if (IS_ERR(error))
		return PTR_ERR(error);

	if (READ_ONCE(error->sgl))
		return 0;

	memset(&m, 0, sizeof(m));
	m.i915 = error->i915;

	__err_print_to_sgl(&m, error);

	if (m.buf) {
		__sg_set_buf(m.cur++, m.buf, m.bytes, m.iter);
		m.bytes = 0;
		m.buf = NULL;
	}
	if (m.cur) {
		GEM_BUG_ON(m.end < m.cur);
		sg_mark_end(m.cur - 1);
	}
	GEM_BUG_ON(m.sgl && !m.cur);

	if (m.err) {
		err_free_sgl(m.sgl);
		return m.err;
	}

	if (cmpxchg(&error->sgl, NULL, m.sgl))
		err_free_sgl(m.sgl);

	return 0;
}

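/*
 * Copy a window of the formatted error state into a caller-supplied buffer.
 * The scatterlist built by err_print_to_sgl() records each chunk's file
 * offset in sg->dma_address, and error->fit caches the last chunk used so
 * that sequential reads do not have to rescan the chain from the start.
 */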
ssize_t i915_gpu_state_copy_to_buffer(struct i915_gpu_state *error,
				      char *buf, loff_t off, size_t rem)
{
	struct scatterlist *sg;
	size_t count;
	loff_t pos;
	int err;

	if (!error || !rem)
		return 0;

	err = err_print_to_sgl(error);
	if (err)
		return err;

	sg = READ_ONCE(error->fit);
	if (!sg || off < sg->dma_address)
		sg = error->sgl;
	if (!sg)
		return 0;

	pos = sg->dma_address;
	count = 0;
	do {
		size_t len, start;

		if (sg_is_chain(sg)) {
			sg = sg_chain_ptr(sg);
			GEM_BUG_ON(sg_is_chain(sg));
		}

		len = sg->length;
		if (pos + len <= off) {
			pos += len;
			continue;
		}

		start = sg->offset;
		if (pos < off) {
			GEM_BUG_ON(off - pos > len);
			len -= off - pos;
			start += off - pos;
			pos = off;
		}

		len = min(len, rem);
		GEM_BUG_ON(!len || len > sg->length);

		memcpy(buf, page_address(sg_page(sg)) + start, len);

		count += len;
		pos += len;

		buf += len;
		rem -= len;
		if (!rem) {
			WRITE_ONCE(error->fit, sg);
			break;
		}
	} while (!sg_is_last(sg++));

	return count;
}

static void i915_error_object_free(struct drm_i915_error_object *obj)
{
	int page;

	if (obj == NULL)
		return;

	for (page = 0; page < obj->page_count; page++)
		free_page((unsigned long)obj->pages[page]);

	kfree(obj);
}

static void cleanup_params(struct i915_gpu_state *error)
{
	i915_params_free(&error->params);
}

static void cleanup_uc_state(struct i915_gpu_state *error)
{
	struct i915_error_uc *error_uc = &error->uc;

	kfree(error_uc->guc_fw.path);
	kfree(error_uc->huc_fw.path);
	i915_error_object_free(error_uc->guc_log);
}

void __i915_gpu_state_free(struct kref *error_ref)
{
	struct i915_gpu_state *error =
		container_of(error_ref, typeof(*error), ref);
	long i, j;

	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
		struct drm_i915_error_engine *ee = &error->engine[i];

		for (j = 0; j < ee->user_bo_count; j++)
			i915_error_object_free(ee->user_bo[j]);
		kfree(ee->user_bo);

		i915_error_object_free(ee->batchbuffer);
		i915_error_object_free(ee->wa_batchbuffer);
		i915_error_object_free(ee->ringbuffer);
		i915_error_object_free(ee->hws_page);
		i915_error_object_free(ee->ctx);
		i915_error_object_free(ee->wa_ctx);

		kfree(ee->requests);
	}

	for (i = 0; i < ARRAY_SIZE(error->active_bo); i++)
		kfree(error->active_bo[i]);
	kfree(error->pinned_bo);

	kfree(error->overlay);
	kfree(error->display);

	cleanup_params(error);
	cleanup_uc_state(error);

	err_free_sgl(error->sgl);
	kfree(error);
}

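/*
 * Snapshot the contents of a vma into freshly allocated pages. Each source
 * page is bound in turn to the reserved error-capture slot in the GGTT and
 * read back through an uncached mapping, then (optionally) compressed. The
 * destination array is sized at 10/8 of the source to leave headroom for the
 * rare case where compression expands the data.
 */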
static struct drm_i915_error_object *
i915_error_object_create(struct drm_i915_private *i915,
			 struct i915_vma *vma)
{
	struct i915_ggtt *ggtt = &i915->ggtt;
	const u64 slot = ggtt->error_capture.start;
	struct drm_i915_error_object *dst;
	struct compress compress;
	unsigned long num_pages;
	struct sgt_iter iter;
	dma_addr_t dma;
	int ret;

	if (!vma || !vma->pages)
		return NULL;

	num_pages = min_t(u64, vma->size, vma->obj->base.size) >> PAGE_SHIFT;
	num_pages = DIV_ROUND_UP(10 * num_pages, 8);
	dst = kmalloc(sizeof(*dst) + num_pages * sizeof(u32 *),
		      GFP_ATOMIC | __GFP_NOWARN);
	if (!dst)
		return NULL;

	dst->gtt_offset = vma->node.start;
	dst->gtt_size = vma->node.size;
	dst->num_pages = num_pages;
	dst->page_count = 0;
	dst->unused = 0;

	if (!compress_init(&compress)) {
		kfree(dst);
		return NULL;
	}

	ret = -EINVAL;
	for_each_sgt_dma(dma, iter, vma->pages) {
		void __iomem *s;

		ggtt->vm.insert_page(&ggtt->vm, dma, slot, I915_CACHE_NONE, 0);

		s = io_mapping_map_atomic_wc(&ggtt->iomap, slot);
		ret = compress_page(&compress, (void __force *)s, dst);
		io_mapping_unmap_atomic(s);
		if (ret)
			break;
	}

	if (ret || compress_flush(&compress, dst)) {
		while (dst->page_count--)
			free_page((unsigned long)dst->pages[dst->page_count]);
		kfree(dst);
		dst = NULL;
	}

	compress_fini(&compress, dst);
	return dst;
}

static void capture_bo(struct drm_i915_error_buffer *err,
		       struct i915_vma *vma)
{
	struct drm_i915_gem_object *obj = vma->obj;

	err->size = obj->base.size;
	err->name = obj->base.name;

	err->gtt_offset = vma->node.start;
	err->read_domains = obj->read_domains;
	err->write_domain = obj->write_domain;
	err->fence_reg = vma->fence ? vma->fence->id : -1;
	err->tiling = i915_gem_object_get_tiling(obj);
	err->dirty = obj->mm.dirty;
	err->purgeable = obj->mm.madv != I915_MADV_WILLNEED;
	err->userptr = obj->userptr.mm != NULL;
	err->cache_level = obj->cache_level;
}

static u32 capture_error_bo(struct drm_i915_error_buffer *err,
			    int count, struct list_head *head,
			    unsigned int flags)
#define ACTIVE_ONLY BIT(0)
#define PINNED_ONLY BIT(1)
{
	struct i915_vma *vma;
	int i = 0;

	list_for_each_entry(vma, head, vm_link) {
		if (!vma->obj)
			continue;

		if (flags & ACTIVE_ONLY && !i915_vma_is_active(vma))
			continue;

		if (flags & PINNED_ONLY && !i915_vma_is_pinned(vma))
			continue;

		capture_bo(err++, vma);
		if (++i == count)
			break;
	}

	return i;
}

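/*
 * Derive a rough "ecode" signature for the hang from the first engine in the
 * mask: XOR of the instruction around the hang (IPEHR) with the INSTDONE
 * snapshot. It is only intended to help tell similar hangs apart in bug
 * reports, not to identify the failure precisely.
 */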
static u32 i915_error_generate_code(struct i915_gpu_state *error,
				    intel_engine_mask_t engine_mask)
{
	if (engine_mask) {
		struct drm_i915_error_engine *ee =
			&error->engine[__ffs(engine_mask)];

		return ee->ipehr ^ ee->instdone.instdone;
	}

	return 0;
}

static void gem_record_fences(struct i915_gpu_state *error)
{
	struct drm_i915_private *dev_priv = error->i915;
	struct intel_uncore *uncore = &dev_priv->uncore;
	int i;

	if (INTEL_GEN(dev_priv) >= 6) {
		for (i = 0; i < dev_priv->ggtt.num_fences; i++)
			error->fence[i] =
				intel_uncore_read64(uncore,
						    FENCE_REG_GEN6_LO(i));
	} else if (INTEL_GEN(dev_priv) >= 4) {
		for (i = 0; i < dev_priv->ggtt.num_fences; i++)
			error->fence[i] =
				intel_uncore_read64(uncore,
						    FENCE_REG_965_LO(i));
	} else {
		for (i = 0; i < dev_priv->ggtt.num_fences; i++)
			error->fence[i] =
				intel_uncore_read(uncore, FENCE_REG(i));
	}
	error->nfence = i;
}

static void error_record_engine_registers(struct i915_gpu_state *error,
					  struct intel_engine_cs *engine,
					  struct drm_i915_error_engine *ee)
{
	struct drm_i915_private *dev_priv = engine->i915;

	if (INTEL_GEN(dev_priv) >= 6) {
		ee->rc_psmi = ENGINE_READ(engine, RING_PSMI_CTL);
		if (INTEL_GEN(dev_priv) >= 8)
			ee->fault_reg = I915_READ(GEN8_RING_FAULT_REG);
		else
			ee->fault_reg = GEN6_RING_FAULT_REG_READ(engine);
	}

	if (INTEL_GEN(dev_priv) >= 4) {
		ee->faddr = ENGINE_READ(engine, RING_DMA_FADD);
		ee->ipeir = ENGINE_READ(engine, RING_IPEIR);
		ee->ipehr = ENGINE_READ(engine, RING_IPEHR);
		ee->instps = ENGINE_READ(engine, RING_INSTPS);
		ee->bbaddr = ENGINE_READ(engine, RING_BBADDR);
		if (INTEL_GEN(dev_priv) >= 8) {
			ee->faddr |= (u64)ENGINE_READ(engine, RING_DMA_FADD_UDW) << 32;
			ee->bbaddr |= (u64)ENGINE_READ(engine, RING_BBADDR_UDW) << 32;
		}
		ee->bbstate = ENGINE_READ(engine, RING_BBSTATE);
	} else {
		ee->faddr = ENGINE_READ(engine, DMA_FADD_I8XX);
		ee->ipeir = ENGINE_READ(engine, IPEIR);
		ee->ipehr = ENGINE_READ(engine, IPEHR);
	}

	intel_engine_get_instdone(engine, &ee->instdone);

	ee->instpm = ENGINE_READ(engine, RING_INSTPM);
	ee->acthd = intel_engine_get_active_head(engine);
	ee->start = ENGINE_READ(engine, RING_START);
	ee->head = ENGINE_READ(engine, RING_HEAD);
	ee->tail = ENGINE_READ(engine, RING_TAIL);
	ee->ctl = ENGINE_READ(engine, RING_CTL);
	if (INTEL_GEN(dev_priv) > 2)
		ee->mode = ENGINE_READ(engine, RING_MI_MODE);

	if (!HWS_NEEDS_PHYSICAL(dev_priv)) {
		i915_reg_t mmio;

		if (IS_GEN(dev_priv, 7)) {
			switch (engine->id) {
			default:
				MISSING_CASE(engine->id);
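				/* fall through */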
			case RCS0:
				mmio = RENDER_HWS_PGA_GEN7;
				break;
			case BCS0:
				mmio = BLT_HWS_PGA_GEN7;
				break;
			case VCS0:
				mmio = BSD_HWS_PGA_GEN7;
				break;
			case VECS0:
				mmio = VEBOX_HWS_PGA_GEN7;
				break;
			}
		} else if (IS_GEN(engine->i915, 6)) {
			mmio = RING_HWS_PGA_GEN6(engine->mmio_base);
		} else {
			mmio = RING_HWS_PGA(engine->mmio_base);
		}

		ee->hws = I915_READ(mmio);
	}

	ee->idle = intel_engine_is_idle(engine);
	if (!ee->idle)
		ee->hangcheck_timestamp = engine->hangcheck.action_timestamp;
	ee->reset_count = i915_reset_engine_count(&dev_priv->gpu_error,
						  engine);

	if (HAS_PPGTT(dev_priv)) {
		int i;

		ee->vm_info.gfx_mode = ENGINE_READ(engine, RING_MODE_GEN7);

		if (IS_GEN(dev_priv, 6)) {
			ee->vm_info.pp_dir_base =
				ENGINE_READ(engine, RING_PP_DIR_BASE_READ);
		} else if (IS_GEN(dev_priv, 7)) {
			ee->vm_info.pp_dir_base =
				ENGINE_READ(engine, RING_PP_DIR_BASE);
		} else if (INTEL_GEN(dev_priv) >= 8) {
			u32 base = engine->mmio_base;

			for (i = 0; i < 4; i++) {
				ee->vm_info.pdp[i] =
					I915_READ(GEN8_RING_PDP_UDW(base, i));
				ee->vm_info.pdp[i] <<= 32;
				ee->vm_info.pdp[i] |=
					I915_READ(GEN8_RING_PDP_LDW(base, i));
			}
		}
	}
}

static void record_request(struct i915_request *request,
			   struct drm_i915_error_request *erq)
{
	struct i915_gem_context *ctx = request->gem_context;

	erq->flags = request->fence.flags;
	erq->context = request->fence.context;
	erq->seqno = request->fence.seqno;
	erq->sched_attr = request->sched.attr;
	erq->jiffies = request->emitted_jiffies;
	erq->start = i915_ggtt_offset(request->ring->vma);
	erq->head = request->head;
	erq->tail = request->tail;

	rcu_read_lock();
	erq->pid = ctx->pid ? pid_nr(ctx->pid) : 0;
	rcu_read_unlock();
}

static void engine_record_requests(struct intel_engine_cs *engine,
				   struct i915_request *first,
				   struct drm_i915_error_engine *ee)
{
	struct i915_request *request;
	int count;

	count = 0;
	request = first;
	list_for_each_entry_from(request, &engine->active.requests, sched.link)
		count++;
	if (!count)
		return;

	ee->requests = kcalloc(count, sizeof(*ee->requests), GFP_ATOMIC);
	if (!ee->requests)
		return;

	ee->num_requests = count;

	count = 0;
	request = first;
	list_for_each_entry_from(request,
				 &engine->active.requests, sched.link) {
		if (count >= ee->num_requests) {
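			/*
			 * The request list may still be changing while it is
			 * being captured, so if it has grown since we sized
			 * ee->requests above, stop here rather than overflow
			 * the array; the snapshot is allowed to be slightly
			 * stale or incomplete.
			 */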
			break;
		}

		record_request(request, &ee->requests[count++]);
	}
	ee->num_requests = count;
}

static void error_record_engine_execlists(struct intel_engine_cs *engine,
					  struct drm_i915_error_engine *ee)
{
	const struct intel_engine_execlists * const execlists = &engine->execlists;
	unsigned int n;

	for (n = 0; n < execlists_num_ports(execlists); n++) {
		struct i915_request *rq = port_request(&execlists->port[n]);

		if (!rq)
			break;

		record_request(rq, &ee->execlist[n]);
	}

	ee->num_ports = n;
}

static void record_context(struct drm_i915_error_context *e,
			   struct i915_gem_context *ctx)
{
	if (ctx->pid) {
		struct task_struct *task;

		rcu_read_lock();
		task = pid_task(ctx->pid, PIDTYPE_PID);
		if (task) {
			strcpy(e->comm, task->comm);
			e->pid = task->pid;
		}
		rcu_read_unlock();
	}

	e->hw_id = ctx->hw_id;
	e->sched_attr = ctx->sched;
	e->guilty = atomic_read(&ctx->guilty_count);
	e->active = atomic_read(&ctx->active_count);
}

static void request_record_user_bo(struct i915_request *request,
				   struct drm_i915_error_engine *ee)
{
	struct i915_capture_list *c;
	struct drm_i915_error_object **bo;
	long count, max;

	max = 0;
	for (c = request->capture_list; c; c = c->next)
		max++;
	if (!max)
		return;

	bo = kmalloc_array(max, sizeof(*bo), GFP_ATOMIC);
	if (!bo) {
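		/*
		 * If we cannot allocate room for every requested capture,
		 * fall back to a single page's worth of pointers and grab
		 * what we can rather than giving up entirely.
		 */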
		max = min_t(long, max, PAGE_SIZE / sizeof(*bo));
		bo = kmalloc_array(max, sizeof(*bo), GFP_ATOMIC);
	}
	if (!bo)
		return;

	count = 0;
	for (c = request->capture_list; c; c = c->next) {
		bo[count] = i915_error_object_create(request->i915, c->vma);
		if (!bo[count])
			break;
		if (++count == max)
			break;
	}

	ee->user_bo = bo;
	ee->user_bo_count = count;
}

static struct drm_i915_error_object *
capture_object(struct drm_i915_private *dev_priv,
	       struct drm_i915_gem_object *obj)
{
	if (obj && i915_gem_object_has_pages(obj)) {
		struct i915_vma fake = {
			.node = { .start = U64_MAX, .size = obj->base.size },
			.size = obj->base.size,
			.pages = obj->mm.pages,
			.obj = obj,
		};

		return i915_error_object_create(dev_priv, &fake);
	} else {
		return NULL;
	}
}

static void gem_record_rings(struct i915_gpu_state *error)
{
	struct drm_i915_private *i915 = error->i915;
	struct i915_ggtt *ggtt = &i915->ggtt;
	int i;

	for (i = 0; i < I915_NUM_ENGINES; i++) {
		struct intel_engine_cs *engine = i915->engine[i];
		struct drm_i915_error_engine *ee = &error->engine[i];
		struct i915_request *request;
		unsigned long flags;

		ee->engine_id = -1;

		if (!engine)
			continue;

		ee->engine_id = i;

		error_record_engine_registers(error, engine, ee);
		error_record_engine_execlists(engine, ee);

		spin_lock_irqsave(&engine->active.lock, flags);
		request = intel_engine_find_active_request(engine);
		if (request) {
			struct i915_gem_context *ctx = request->gem_context;
			struct intel_ring *ring = request->ring;

			ee->vm = ctx->vm ?: &ggtt->vm;

			record_context(&ee->context, ctx);

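			/*
			 * Snapshot the batch and context images now, while
			 * the request is still pinned: once we drop the
			 * engine lock they may be reused and overwritten by
			 * later submissions.
			 */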
			ee->batchbuffer =
				i915_error_object_create(i915, request->batch);

			if (HAS_BROKEN_CS_TLB(i915))
				ee->wa_batchbuffer =
					i915_error_object_create(i915,
								 i915->gt.scratch);
			request_record_user_bo(request, ee);

			ee->ctx =
				i915_error_object_create(i915,
							 request->hw_context->state);

			error->simulated |=
				i915_gem_context_no_error_capture(ctx);

			ee->rq_head = request->head;
			ee->rq_post = request->postfix;
			ee->rq_tail = request->tail;

			ee->cpu_ring_head = ring->head;
			ee->cpu_ring_tail = ring->tail;
			ee->ringbuffer =
				i915_error_object_create(i915, ring->vma);

			engine_record_requests(engine, request, ee);
		}
		spin_unlock_irqrestore(&engine->active.lock, flags);

		ee->hws_page =
			i915_error_object_create(i915,
						 engine->status_page.vma);

		ee->wa_ctx = i915_error_object_create(i915, engine->wa_ctx.vma);

		ee->default_state = capture_object(i915, engine->default_state);
	}
}

static void gem_capture_vm(struct i915_gpu_state *error,
			   struct i915_address_space *vm,
			   int idx)
{
	struct drm_i915_error_buffer *active_bo;
	struct i915_vma *vma;
	int count;

	count = 0;
	list_for_each_entry(vma, &vm->bound_list, vm_link)
		if (i915_vma_is_active(vma))
			count++;

	active_bo = NULL;
	if (count)
		active_bo = kcalloc(count, sizeof(*active_bo), GFP_ATOMIC);
	if (active_bo)
		count = capture_error_bo(active_bo,
					 count, &vm->bound_list,
					 ACTIVE_ONLY);
	else
		count = 0;

	error->active_vm[idx] = vm;
	error->active_bo[idx] = active_bo;
	error->active_bo_count[idx] = count;
}

static void capture_active_buffers(struct i915_gpu_state *error)
{
	int cnt = 0, i, j;

	BUILD_BUG_ON(ARRAY_SIZE(error->engine) > ARRAY_SIZE(error->active_bo));
	BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_vm));
	BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_bo_count));

	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
		struct drm_i915_error_engine *ee = &error->engine[i];
		bool found;

		if (!ee->vm)
			continue;

		found = false;
		for (j = 0; j < i && !found; j++)
			found = error->engine[j].vm == ee->vm;
		if (!found)
			gem_capture_vm(error, ee->vm, cnt++);
	}
}

static void capture_pinned_buffers(struct i915_gpu_state *error)
{
	struct i915_address_space *vm = &error->i915->ggtt.vm;
	struct drm_i915_error_buffer *bo;
	struct i915_vma *vma;
	int count;

	count = 0;
	list_for_each_entry(vma, &vm->bound_list, vm_link)
		count++;

	bo = NULL;
	if (count)
		bo = kcalloc(count, sizeof(*bo), GFP_ATOMIC);
	if (!bo)
		return;

	error->pinned_bo_count =
		capture_error_bo(bo, count, &vm->bound_list, PINNED_ONLY);
	error->pinned_bo = bo;
}

static void capture_uc_state(struct i915_gpu_state *error)
{
	struct drm_i915_private *i915 = error->i915;
	struct i915_error_uc *error_uc = &error->uc;

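	/* Capturing uC state won't be useful if there is no GuC */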
	if (!error->device_info.has_guc)
		return;

	error_uc->guc_fw = i915->guc.fw;
	error_uc->huc_fw = i915->huc.fw;

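	/*
	 * Non-default firmware paths are module parameters that may change
	 * (or be freed) after capture, so keep our own copies of the strings
	 * rather than pointing into the modparam storage.
	 */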
	error_uc->guc_fw.path = kstrdup(i915->guc.fw.path, GFP_ATOMIC);
	error_uc->huc_fw.path = kstrdup(i915->huc.fw.path, GFP_ATOMIC);
	error_uc->guc_log = i915_error_object_create(i915, i915->guc.log.vma);
}

static void capture_reg_state(struct i915_gpu_state *error)
{
	struct drm_i915_private *i915 = error->i915;
	struct intel_uncore *uncore = &i915->uncore;
	int i;

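	/*
	 * The capture below is loosely grouped: registers specific to a
	 * single platform first, then registers shared across several
	 * generations, and finally the interrupt enable registers whose
	 * layout varies the most between platforms.
	 */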
	if (IS_VALLEYVIEW(i915)) {
		error->gtier[0] = intel_uncore_read(uncore, GTIER);
		error->ier = intel_uncore_read(uncore, VLV_IER);
		error->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE_VLV);
	}

	if (IS_GEN(i915, 7))
		error->err_int = intel_uncore_read(uncore, GEN7_ERR_INT);

	if (INTEL_GEN(i915) >= 8) {
		error->fault_data0 = intel_uncore_read(uncore,
						       GEN8_FAULT_TLB_DATA0);
		error->fault_data1 = intel_uncore_read(uncore,
						       GEN8_FAULT_TLB_DATA1);
	}

	if (IS_GEN(i915, 6)) {
		error->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE);
		error->gab_ctl = intel_uncore_read(uncore, GAB_CTL);
		error->gfx_mode = intel_uncore_read(uncore, GFX_MODE);
	}

	if (INTEL_GEN(i915) >= 7)
		error->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE_MT);

	if (INTEL_GEN(i915) >= 6) {
		error->derrmr = intel_uncore_read(uncore, DERRMR);
		error->error = intel_uncore_read(uncore, ERROR_GEN6);
		error->done_reg = intel_uncore_read(uncore, DONE_REG);
	}

	if (INTEL_GEN(i915) >= 5)
		error->ccid = intel_uncore_read(uncore, CCID(RENDER_RING_BASE));

	if (IS_GEN_RANGE(i915, 6, 7)) {
		error->gam_ecochk = intel_uncore_read(uncore, GAM_ECOCHK);
		error->gac_eco = intel_uncore_read(uncore, GAC_ECO_BITS);
	}

	if (INTEL_GEN(i915) >= 11) {
		error->ier = intel_uncore_read(uncore, GEN8_DE_MISC_IER);
		error->gtier[0] =
			intel_uncore_read(uncore,
					  GEN11_RENDER_COPY_INTR_ENABLE);
		error->gtier[1] =
			intel_uncore_read(uncore, GEN11_VCS_VECS_INTR_ENABLE);
		error->gtier[2] =
			intel_uncore_read(uncore, GEN11_GUC_SG_INTR_ENABLE);
		error->gtier[3] =
			intel_uncore_read(uncore,
					  GEN11_GPM_WGBOXPERF_INTR_ENABLE);
		error->gtier[4] =
			intel_uncore_read(uncore,
					  GEN11_CRYPTO_RSVD_INTR_ENABLE);
		error->gtier[5] =
			intel_uncore_read(uncore,
					  GEN11_GUNIT_CSME_INTR_ENABLE);
		error->ngtier = 6;
	} else if (INTEL_GEN(i915) >= 8) {
		error->ier = intel_uncore_read(uncore, GEN8_DE_MISC_IER);
		for (i = 0; i < 4; i++)
			error->gtier[i] = intel_uncore_read(uncore,
							    GEN8_GT_IER(i));
		error->ngtier = 4;
	} else if (HAS_PCH_SPLIT(i915)) {
		error->ier = intel_uncore_read(uncore, DEIER);
		error->gtier[0] = intel_uncore_read(uncore, GTIER);
		error->ngtier = 1;
	} else if (IS_GEN(i915, 2)) {
		error->ier = intel_uncore_read16(uncore, GEN2_IER);
	} else if (!IS_VALLEYVIEW(i915)) {
		error->ier = intel_uncore_read(uncore, GEN2_IER);
	}
	error->eir = intel_uncore_read(uncore, EIR);
	error->pgtbl_er = intel_uncore_read(uncore, PGTBL_ER);
}

static const char *
error_msg(struct i915_gpu_state *error,
	  intel_engine_mask_t engines, const char *msg)
{
	int len;
	int i;

	for (i = 0; i < ARRAY_SIZE(error->engine); i++)
		if (!error->engine[i].context.pid)
			engines &= ~BIT(i);

	len = scnprintf(error->error_msg, sizeof(error->error_msg),
			"GPU HANG: ecode %d:%x:0x%08x",
			INTEL_GEN(error->i915), engines,
			i915_error_generate_code(error, engines));
	if (engines) {
		i = __ffs(engines);
		len += scnprintf(error->error_msg + len,
				 sizeof(error->error_msg) - len,
				 ", in %s [%d]",
				 error->engine[i].context.comm,
				 error->engine[i].context.pid);
	}
	if (msg)
		len += scnprintf(error->error_msg + len,
				 sizeof(error->error_msg) - len,
				 ", %s", msg);

	return error->error_msg;
}

static void capture_gen_state(struct i915_gpu_state *error)
{
	struct drm_i915_private *i915 = error->i915;

	error->awake = i915->gt.awake;
	error->wakelock = atomic_read(&i915->runtime_pm.wakeref_count);
	error->suspended = i915->runtime_pm.suspended;

	error->iommu = -1;
#ifdef CONFIG_INTEL_IOMMU
	error->iommu = intel_iommu_gfx_mapped;
#endif
	error->reset_count = i915_reset_count(&i915->gpu_error);
	error->suspend_count = i915->suspend_count;

	memcpy(&error->device_info,
	       INTEL_INFO(i915),
	       sizeof(error->device_info));
	memcpy(&error->runtime_info,
	       RUNTIME_INFO(i915),
	       sizeof(error->runtime_info));
	error->driver_caps = i915->caps;
}

static void capture_params(struct i915_gpu_state *error)
{
	i915_params_copy(&error->params, &i915_modparams);
}

static unsigned long capture_find_epoch(const struct i915_gpu_state *error)
{
	unsigned long epoch = error->capture;
	int i;

	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
		const struct drm_i915_error_engine *ee = &error->engine[i];

		if (ee->hangcheck_timestamp &&
		    time_before(ee->hangcheck_timestamp, epoch))
			epoch = ee->hangcheck_timestamp;
	}

	return epoch;
}

static void capture_finish(struct i915_gpu_state *error)
{
	struct i915_ggtt *ggtt = &error->i915->ggtt;
	const u64 slot = ggtt->error_capture.start;

	ggtt->vm.clear_range(&ggtt->vm, slot, PAGE_SIZE);
}

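/*
 * capture() is run via stop_machine() from i915_capture_gpu_state(), so the
 * whole snapshot is taken with the other CPUs halted and the hardware state
 * frozen in place.
 */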
static int capture(void *data)
{
	struct i915_gpu_state *error = data;

	error->time = ktime_get_real();
	error->boottime = ktime_get_boottime();
	error->uptime = ktime_sub(ktime_get(),
				  error->i915->gt.last_init_time);
	error->capture = jiffies;

	capture_params(error);
	capture_gen_state(error);
	capture_uc_state(error);
	capture_reg_state(error);
	gem_record_fences(error);
	gem_record_rings(error);
	capture_active_buffers(error);
	capture_pinned_buffers(error);

	error->overlay = intel_overlay_capture_error_state(error->i915);
	error->display = intel_display_capture_error_state(error->i915);

	error->epoch = capture_find_epoch(error);

	capture_finish(error);
	return 0;
}

#define DAY_AS_SECONDS(x) (24 * 60 * 60 * (x))

struct i915_gpu_state *
i915_capture_gpu_state(struct drm_i915_private *i915)
{
	struct i915_gpu_state *error;

	error = READ_ONCE(i915->gpu_error.first_error);
	if (IS_ERR(error))
		return error;

	error = kzalloc(sizeof(*error), GFP_ATOMIC);
	if (!error) {
		i915_disable_error_state(i915, -ENOMEM);
		return ERR_PTR(-ENOMEM);
	}

	kref_init(&error->ref);
	error->i915 = i915;

	stop_machine(capture, error, NULL);

	return error;
}

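/**
 * i915_capture_error_state - capture an error record for later analysis
 * @i915: i915 device private
 * @engine_mask: the mask of engines triggering the hang
 * @msg: a message to insert into the error capture header
 *
 * Should be called when an error is detected (either a hang or an error
 * interrupt) to capture error state from the time of the error. The capture
 * is stashed in i915->gpu_error.first_error so that it can later be read out
 * through the sysfs "error" attribute.
 */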
void i915_capture_error_state(struct drm_i915_private *i915,
			      intel_engine_mask_t engine_mask,
			      const char *msg)
{
	static bool warned;
	struct i915_gpu_state *error;
	unsigned long flags;

	if (!i915_modparams.error_capture)
		return;

	if (READ_ONCE(i915->gpu_error.first_error))
		return;

	error = i915_capture_gpu_state(i915);
	if (IS_ERR(error))
		return;

	dev_info(i915->drm.dev, "%s\n", error_msg(error, engine_mask, msg));

	if (!error->simulated) {
		spin_lock_irqsave(&i915->gpu_error.lock, flags);
		if (!i915->gpu_error.first_error) {
			i915->gpu_error.first_error = error;
			error = NULL;
		}
		spin_unlock_irqrestore(&i915->gpu_error.lock, flags);
	}

	if (error) {
		__i915_gpu_state_free(&error->ref);
		return;
	}

	if (!warned &&
	    ktime_get_real_seconds() - DRIVER_TIMESTAMP < DAY_AS_SECONDS(180)) {
		DRM_INFO("GPU hangs can indicate a bug anywhere in the entire gfx stack, including userspace.\n");
		DRM_INFO("Please file a _new_ bug report on bugs.freedesktop.org against DRI -> DRM/Intel\n");
		DRM_INFO("drm/i915 developers can then reassign to the right component if it's not a kernel issue.\n");
		DRM_INFO("The gpu crash dump is required to analyze gpu hangs, so please always attach it.\n");
		DRM_INFO("GPU crash dump saved to /sys/class/drm/card%d/error\n",
			 i915->drm.primary->index);
		warned = true;
	}
}

struct i915_gpu_state *
i915_first_error_state(struct drm_i915_private *i915)
{
	struct i915_gpu_state *error;

	spin_lock_irq(&i915->gpu_error.lock);
	error = i915->gpu_error.first_error;
	if (!IS_ERR_OR_NULL(error))
		i915_gpu_state_get(error);
	spin_unlock_irq(&i915->gpu_error.lock);

	return error;
}

void i915_reset_error_state(struct drm_i915_private *i915)
{
	struct i915_gpu_state *error;

	spin_lock_irq(&i915->gpu_error.lock);
	error = i915->gpu_error.first_error;
	if (error != ERR_PTR(-ENODEV))
		i915->gpu_error.first_error = NULL;
	spin_unlock_irq(&i915->gpu_error.lock);

	if (!IS_ERR_OR_NULL(error))
		i915_gpu_state_put(error);
}

void i915_disable_error_state(struct drm_i915_private *i915, int err)
{
	spin_lock_irq(&i915->gpu_error.lock);
	if (!i915->gpu_error.first_error)
		i915->gpu_error.first_error = ERR_PTR(err);
	spin_unlock_irq(&i915->gpu_error.lock);
}