/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OF THE
 * SOFTWARE.
 *
 */

#include <linux/kthread.h>

#include "../i915_selftest.h"

#include "mock_context.h"
#include "mock_drm.h"

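/*
 * Shared state for the hang tests: a kernel context, a self-recursing
 * batch object and a "hardware status page" (hws) scratch object into
 * which each hanging batch reports its seqno, so the tests can tell
 * when the spinner has actually started running on the GPU.
 */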
struct hang {
	struct drm_i915_private *i915;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	struct i915_gem_context *ctx;
	u32 *seqno;
	u32 *batch;
};

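/*
 * Allocate the context and both objects, and pin CPU maps for them.
 * The seqno page is poisoned with 0xff so that no slot matches a real
 * seqno before its batch has written one.
 */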
static int hang_init(struct hang *h, struct drm_i915_private *i915)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->i915 = i915;

	h->ctx = kernel_context(i915);
	if (IS_ERR(h->ctx))
		return PTR_ERR(h->ctx);

	h->hws = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(h->hws)) {
		err = PTR_ERR(h->hws);
		goto err_ctx;
	}

	h->obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_level(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map(h->obj,
					HAS_LLC(i915) ? I915_MAP_WB : I915_MAP_WC);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
err_ctx:
	kernel_context_close(h->ctx);
	return err;
}

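/* Each fence context owns its own u32 seqno slot within the hws page. */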
static u64 hws_address(const struct i915_vma *hws,
		       const struct i915_request *rq)
{
	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}

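/*
 * Emit a batch that writes the request's seqno into its slot in the hws
 * page and then spins forever by branching back to its own start. The
 * gen8+ program is, roughly:
 *
 *	MI_STORE_DWORD_IMM	hws_address(hws, rq) := rq->fence.seqno
 *	MI_ARB_CHECK
 *	<1KiB of MI_NOOP padding>
 *	MI_ARB_CHECK
 *	MI_BATCH_BUFFER_START	-> back to the first instruction
 *	MI_BATCH_BUFFER_END	(not reached)
 *
 * The older gens below emit the same program with their own opcode and
 * address-width variants. The MI_ARB_CHECK instructions leave points at
 * which the spinner may be preempted or cancelled.
 */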
static int emit_recurse_batch(struct hang *h,
			      struct i915_request *rq)
{
	struct drm_i915_private *i915 = h->i915;
	struct i915_address_space *vm =
		rq->ctx->ppgtt ? &rq->ctx->ppgtt->base : &i915->ggtt.base;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	u32 *batch;
	int err;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws))
		return PTR_ERR(hws);

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err)
		return err;

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	i915_vma_move_to_active(vma, rq, 0);
	if (!i915_gem_object_has_active_reference(vma->obj)) {
		i915_gem_object_get(vma->obj);
		i915_gem_object_set_active_reference(vma->obj);
	}

	i915_vma_move_to_active(hws, rq, 0);
	if (!i915_gem_object_has_active_reference(hws->obj)) {
		i915_gem_object_get(hws->obj);
		i915_gem_object_set_active_reference(hws->obj);
	}

	batch = h->batch;
	if (INTEL_GEN(i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (INTEL_GEN(i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (INTEL_GEN(i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | 1 << 22;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6 | 1;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END;
	i915_gem_chipset_flush(h->i915);

	flags = 0;
	if (INTEL_GEN(vm->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	return err;
}

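/*
 * Build a request around the hanging batch. If the current batch object
 * is still busy on the GPU, swap in a freshly allocated one first so the
 * new batch can be written without waiting for the old spinner to stop.
 */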
static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
	struct i915_request *rq;
	int err;

	if (i915_gem_object_is_active(h->obj)) {
		struct drm_i915_gem_object *obj;
		void *vaddr;

		obj = i915_gem_object_create_internal(h->i915, PAGE_SIZE);
		if (IS_ERR(obj))
			return ERR_CAST(obj);

		vaddr = i915_gem_object_pin_map(obj,
						HAS_LLC(h->i915) ? I915_MAP_WB : I915_MAP_WC);
		if (IS_ERR(vaddr)) {
			i915_gem_object_put(obj);
			return ERR_CAST(vaddr);
		}

		i915_gem_object_unpin_map(h->obj);
		i915_gem_object_put(h->obj);

		h->obj = obj;
		h->batch = vaddr;
	}

	rq = i915_request_alloc(engine, h->ctx);
	if (IS_ERR(rq))
		return rq;

	err = emit_recurse_batch(h, rq);
	if (err) {
		__i915_request_add(rq, false);
		return ERR_PTR(err);
	}

	return rq;
}

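/* Sample the seqno that the hanging batch has reported for this request. */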
static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

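/*
 * Watchdog for the tests themselves: if a step fails to complete within
 * the given timeout, wedge the device and cancel all further testing
 * instead of hanging the selftest runner.
 */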
struct wedge_me {
	struct delayed_work work;
	struct drm_i915_private *i915;
	const void *symbol;
};

static void wedge_me(struct work_struct *work)
{
	struct wedge_me *w = container_of(work, typeof(*w), work.work);

	pr_err("%pS timed out, cancelling all further testing.\n",
	       w->symbol);
	i915_gem_set_wedged(w->i915);
}

static void __init_wedge(struct wedge_me *w,
			 struct drm_i915_private *i915,
			 long timeout,
			 const void *symbol)
{
	w->i915 = i915;
	w->symbol = symbol;

	INIT_DELAYED_WORK_ONSTACK(&w->work, wedge_me);
	schedule_delayed_work(&w->work, timeout);
}

static void __fini_wedge(struct wedge_me *w)
{
	cancel_delayed_work_sync(&w->work);
	destroy_delayed_work_on_stack(&w->work);
	w->i915 = NULL;
}

#define wedge_on_timeout(W, DEV, TIMEOUT)				\
	for (__init_wedge((W), (DEV), (TIMEOUT), __builtin_return_address(0)); \
	     (W)->i915;							\
	     __fini_wedge((W)))

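/*
 * Flush residual work between subtests: wait for the GPU to idle,
 * wedging the device if that takes longer than a second.
 */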
static noinline int
flush_test(struct drm_i915_private *i915, unsigned int flags)
{
	struct wedge_me w;

	cond_resched();

	wedge_on_timeout(&w, i915, HZ)
		i915_gem_wait_for_idle(i915, flags);

	return i915_terminally_wedged(&i915->gpu_error) ? -EIO : 0;
}

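/*
 * Terminate the spinner by rewriting its first dword to
 * MI_BATCH_BUFFER_END before releasing the objects, so the GPU stops
 * executing the loop before its backing pages are freed.
 */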
static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	i915_gem_chipset_flush(h->i915);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	kernel_context_close(h->ctx);

	flush_test(h->i915, I915_WAIT_LOCKED);
}

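/*
 * Wait for the hanging batch to start executing, i.e. to report its
 * seqno via the hws page: a short 10us busy-wait followed by up to a
 * second of sleeping waits.
 */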
static bool wait_for_hang(struct hang *h, struct i915_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

static int igt_hang_sanitycheck(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct i915_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	for_each_engine(engine, i915, id) {
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_request_get(rq);

		*h.batch = MI_BATCH_BUFFER_END;
		i915_gem_chipset_flush(i915);

		__i915_request_add(rq, true);

		timeout = i915_request_wait(rq,
					    I915_WAIT_LOCKED,
					    MAX_SCHEDULE_TIMEOUT);
		i915_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	return err;
}

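/*
 * Serialise with any external reset handling by claiming the backoff
 * bit and every per-engine reset bit; paired with global_reset_unlock().
 */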
static void global_reset_lock(struct drm_i915_private *i915)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	pr_debug("%s: current gpu_error=%08lx\n",
		 __func__, i915->gpu_error.flags);

	while (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags))
		wait_event(i915->gpu_error.reset_queue,
			   !test_bit(I915_RESET_BACKOFF,
				     &i915->gpu_error.flags));

	for_each_engine(engine, i915, id) {
		while (test_and_set_bit(I915_RESET_ENGINE + id,
					&i915->gpu_error.flags))
			wait_on_bit(&i915->gpu_error.flags,
				    I915_RESET_ENGINE + id,
				    TASK_UNINTERRUPTIBLE);
	}
}

static void global_reset_unlock(struct drm_i915_private *i915)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, i915, id)
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);

	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
	wake_up_all(&i915->gpu_error.reset_queue);
}

static int igt_global_reset(void *arg)
{
	struct drm_i915_private *i915 = arg;
	unsigned int reset_count;
	int err = 0;

	/* Check that we can issue a global GPU reset */

	global_reset_lock(i915);
	set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);

	mutex_lock(&i915->drm.struct_mutex);
	reset_count = i915_reset_count(&i915->gpu_error);

	i915_reset(i915, I915_RESET_QUIET);

	if (i915_reset_count(&i915->gpu_error) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
	}
	mutex_unlock(&i915->drm.struct_mutex);

	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		err = -EIO;

	return err;
}

static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err = 0;

	/*
	 * Check that we can issue an engine reset with the engine either
	 * idle or busy executing our hanging batch.
	 */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (active) {
		mutex_lock(&i915->drm.struct_mutex);
		err = hang_init(&h, i915);
		mutex_unlock(&i915->drm.struct_mutex);
		if (err)
			return err;
	}

	for_each_engine(engine, i915, id) {
		unsigned int reset_count, reset_engine_count;
		IGT_TIMEOUT(end_time);

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		reset_count = i915_reset_count(&i915->gpu_error);
		reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
							     engine);

		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		do {
			if (active) {
				struct i915_request *rq;

				mutex_lock(&i915->drm.struct_mutex);
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					mutex_unlock(&i915->drm.struct_mutex);
					break;
				}

				i915_request_get(rq);
				__i915_request_add(rq, true);
				mutex_unlock(&i915->drm.struct_mutex);

				if (!wait_for_hang(&h, rq)) {
					struct drm_printer p = drm_info_printer(i915->drm.dev);

					pr_err("%s: Failed to start request %x, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			engine->hangcheck.stalled = true;
			engine->hangcheck.seqno =
				intel_engine_get_seqno(engine);

			err = i915_reset_engine(engine, I915_RESET_QUIET);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(&i915->gpu_error) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			reset_engine_count += active;
			if (i915_reset_engine_count(&i915->gpu_error, engine) !=
			    reset_engine_count) {
				pr_err("%s engine reset %srecorded!\n",
				       engine->name, active ? "not " : "");
				err = -EINVAL;
				break;
			}

			engine->hangcheck.stalled = false;
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);

		if (err)
			break;

		err = flush_test(i915, 0);
		if (err)
			break;
	}

	if (i915_terminally_wedged(&i915->gpu_error))
		err = -EIO;

	if (active) {
		mutex_lock(&i915->drm.struct_mutex);
		hang_fini(&h);
		mutex_unlock(&i915->drm.struct_mutex);
	}

	return err;
}

static int igt_reset_idle_engine(void *arg)
{
	return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
	return __igt_reset_engine(arg, true);
}

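/*
 * kthread body used to keep the "innocent" engines busy while another
 * engine is reset: ping-pong requests between two contexts, always
 * waiting for the older request to complete before replacing it.
 */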
static int active_engine(void *data)
{
	struct intel_engine_cs *engine = data;
	struct i915_request *rq[2] = {};
	struct i915_gem_context *ctx[2];
	struct drm_file *file;
	unsigned long count = 0;
	int err = 0;

	file = mock_file(engine->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	mutex_lock(&engine->i915->drm.struct_mutex);
	ctx[0] = live_context(engine->i915, file);
	mutex_unlock(&engine->i915->drm.struct_mutex);
	if (IS_ERR(ctx[0])) {
		err = PTR_ERR(ctx[0]);
		goto err_file;
	}

	mutex_lock(&engine->i915->drm.struct_mutex);
	ctx[1] = live_context(engine->i915, file);
	mutex_unlock(&engine->i915->drm.struct_mutex);
	if (IS_ERR(ctx[1])) {
		err = PTR_ERR(ctx[1]);
		i915_gem_context_put(ctx[0]);
		goto err_file;
	}

	while (!kthread_should_stop()) {
		unsigned int idx = count++ & 1;
		struct i915_request *old = rq[idx];
		struct i915_request *new;

		mutex_lock(&engine->i915->drm.struct_mutex);
		new = i915_request_alloc(engine, ctx[idx]);
		if (IS_ERR(new)) {
			mutex_unlock(&engine->i915->drm.struct_mutex);
			err = PTR_ERR(new);
			break;
		}

		rq[idx] = i915_request_get(new);
		i915_request_add(new);
		mutex_unlock(&engine->i915->drm.struct_mutex);

		if (old) {
			i915_request_wait(old, 0, MAX_SCHEDULE_TIMEOUT);
			i915_request_put(old);
		}
	}

	for (count = 0; count < ARRAY_SIZE(rq); count++)
		i915_request_put(rq[count]);

err_file:
	mock_file_free(engine->i915, file);
	return err;
}

static int __igt_reset_engine_others(struct drm_i915_private *i915,
				     bool active)
{
	struct intel_engine_cs *engine, *other;
	enum intel_engine_id id, tmp;
	struct hang h;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
	 * with any other engine.
	 */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (active) {
		mutex_lock(&i915->drm.struct_mutex);
		err = hang_init(&h, i915);
		mutex_unlock(&i915->drm.struct_mutex);
		if (err)
			return err;
	}

	for_each_engine(engine, i915, id) {
		struct task_struct *threads[I915_NUM_ENGINES] = {};
		unsigned long resets[I915_NUM_ENGINES];
		unsigned long global = i915_reset_count(&i915->gpu_error);
		unsigned long count = 0;
		IGT_TIMEOUT(end_time);

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		memset(threads, 0, sizeof(threads));
		for_each_engine(other, i915, tmp) {
			struct task_struct *tsk;

			resets[tmp] = i915_reset_engine_count(&i915->gpu_error,
							      other);

			if (other == engine)
				continue;

			tsk = kthread_run(active_engine, other,
					  "igt/%s", other->name);
			if (IS_ERR(tsk)) {
				err = PTR_ERR(tsk);
				goto unwind;
			}

			threads[tmp] = tsk;
			get_task_struct(tsk);
		}

		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		do {
			if (active) {
				struct i915_request *rq;

				mutex_lock(&i915->drm.struct_mutex);
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					mutex_unlock(&i915->drm.struct_mutex);
					break;
				}

				i915_request_get(rq);
				__i915_request_add(rq, true);
				mutex_unlock(&i915->drm.struct_mutex);

				if (!wait_for_hang(&h, rq)) {
					struct drm_printer p = drm_info_printer(i915->drm.dev);

					pr_err("%s: Failed to start request %x, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			engine->hangcheck.stalled = true;
			engine->hangcheck.seqno =
				intel_engine_get_seqno(engine);

			err = i915_reset_engine(engine, I915_RESET_QUIET);
			if (err) {
				pr_err("i915_reset_engine(%s:%s) failed, err=%d\n",
				       engine->name, active ? "active" : "idle", err);
				break;
			}

			engine->hangcheck.stalled = false;
			count++;
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
			engine->name, active ? "active" : "idle", count);

		if (i915_reset_engine_count(&i915->gpu_error, engine) -
		    resets[engine->id] != (active ? count : 0)) {
			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
			       engine->name, active ? "active" : "idle", count,
			       i915_reset_engine_count(&i915->gpu_error,
						       engine) - resets[engine->id]);
			if (!err)
				err = -EINVAL;
		}

unwind:
		for_each_engine(other, i915, tmp) {
			int ret;

			if (!threads[tmp])
				continue;

			ret = kthread_stop(threads[tmp]);
			if (ret) {
				pr_err("kthread for other engine %s failed, err=%d\n",
				       other->name, ret);
				if (!err)
					err = ret;
			}
			put_task_struct(threads[tmp]);

			if (resets[tmp] != i915_reset_engine_count(&i915->gpu_error,
								   other)) {
				pr_err("Innocent engine %s was reset (count=%ld)\n",
				       other->name,
				       i915_reset_engine_count(&i915->gpu_error,
							       other) - resets[tmp]);
				if (!err)
					err = -EINVAL;
			}
		}

		if (global != i915_reset_count(&i915->gpu_error)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(&i915->gpu_error) - global);
			if (!err)
				err = -EINVAL;
		}

		if (err)
			break;

		err = flush_test(i915, 0);
		if (err)
			break;
	}

	if (i915_terminally_wedged(&i915->gpu_error))
		err = -EIO;

	if (active) {
		mutex_lock(&i915->drm.struct_mutex);
		hang_fini(&h);
		mutex_unlock(&i915->drm.struct_mutex);
	}

	return err;
}

static int igt_reset_idle_engine_others(void *arg)
{
	return __igt_reset_engine_others(arg, false);
}

static int igt_reset_active_engine_others(void *arg)
{
	return __igt_reset_engine_others(arg, true);
}

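/*
 * Pretend hangcheck has declared the engine stuck: mark the current
 * seqno as stalled and hand the reset over to a waiter by setting
 * I915_RESET_HANDOFF.
 */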
static u32 fake_hangcheck(struct i915_request *rq)
{
	u32 reset_count;

	rq->engine->hangcheck.stalled = true;
	rq->engine->hangcheck.seqno = intel_engine_get_seqno(rq->engine);

	reset_count = i915_reset_count(&rq->i915->gpu_error);

	set_bit(I915_RESET_HANDOFF, &rq->i915->gpu_error.flags);
	wake_up_all(&rq->i915->gpu_error.wait_queue);

	return reset_count;
}

static int igt_wait_reset(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct i915_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	if (!intel_engine_can_store_dword(i915->engine[RCS]))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	global_reset_lock(i915);

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	rq = hang_create_request(&h, i915->engine[RCS]);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto fini;
	}

	i915_request_get(rq);
	__i915_request_add(rq, true);

	if (!wait_for_hang(&h, rq)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("%s: Failed to start request %x, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_reset(i915, 0);
		i915_gem_set_wedged(i915);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(rq);

	timeout = i915_request_wait(rq, I915_WAIT_LOCKED, 10);
	if (timeout < 0) {
		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
	if (i915_reset_count(&i915->gpu_error) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		return -EIO;

	return err;
}

static int igt_reset_queue(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	global_reset_lock(i915);

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	for_each_engine(engine, i915, id) {
		struct i915_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;

		if (!intel_engine_can_store_dword(engine))
			continue;

		prev = hang_create_request(&h, engine);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			goto fini;
		}

		i915_request_get(prev);
		__i915_request_add(prev, true);

		count = 0;
		do {
			struct i915_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h, engine);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto fini;
			}

			i915_request_get(rq);
			__i915_request_add(rq, true);

			if (!wait_for_hang(&h, prev)) {
				struct drm_printer p = drm_info_printer(i915->drm.dev);

				pr_err("%s: Failed to start request %x, at %x\n",
				       __func__, prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(prev->engine, &p,
						  "%s\n", prev->engine->name);

				i915_request_put(rq);
				i915_request_put(prev);

				i915_reset(i915, 0);
				i915_gem_set_wedged(i915);

				err = -EIO;
				goto fini;
			}

			reset_count = fake_hangcheck(prev);

			i915_reset(i915, I915_RESET_QUIET);

			GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
					    &i915->gpu_error.flags));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (i915_reset_count(&i915->gpu_error) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			i915_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d resets\n", engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		i915_gem_chipset_flush(i915);

		i915_request_put(prev);

		err = flush_test(i915, I915_WAIT_LOCKED);
		if (err)
			break;
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		return -EIO;

	return err;
}

static int igt_handle_error(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine = i915->engine[RCS];
	struct hang h;
	struct i915_request *rq;
	struct i915_gpu_state *error;
	int err;

	/* Check that the guilty request is identified following a hang */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (!intel_engine_can_store_dword(i915->engine[RCS]))
		return 0;

	mutex_lock(&i915->drm.struct_mutex);

	err = hang_init(&h, i915);
	if (err)
		goto err_unlock;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto err_fini;
	}

	i915_request_get(rq);
	__i915_request_add(rq, true);

	if (!wait_for_hang(&h, rq)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("%s: Failed to start request %x, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_reset(i915, 0);
		i915_gem_set_wedged(i915);

		err = -EIO;
		goto err_request;
	}

	mutex_unlock(&i915->drm.struct_mutex);

	/* Temporarily disable error capture */
	error = xchg(&i915->gpu_error.first_error, (void *)-1);

	engine->hangcheck.stalled = true;
	engine->hangcheck.seqno = intel_engine_get_seqno(engine);

	i915_handle_error(i915, intel_engine_flag(engine), "%s", __func__);

	xchg(&i915->gpu_error.first_error, error);

	mutex_lock(&i915->drm.struct_mutex);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_request_put(rq);
err_fini:
	hang_fini(&h);
err_unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	return err;
}

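/*
 * Entry point for the live hangcheck/reset selftests. Periodic
 * hangcheck is disabled for the duration so that each subtest can
 * inject hangs and resets at points of its own choosing.
 */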
int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_global_reset),
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
		SUBTEST(igt_reset_idle_engine_others),
		SUBTEST(igt_reset_active_engine_others),
		SUBTEST(igt_wait_reset),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_handle_error),
	};
	bool saved_hangcheck;
	int err;

	if (!intel_has_gpu_reset(i915))
		return 0;

	intel_runtime_pm_get(i915);
	saved_hangcheck = fetch_and_zero(&i915_modparams.enable_hangcheck);

	err = i915_subtests(tests, i915);

	i915_modparams.enable_hangcheck = saved_hangcheck;
	intel_runtime_pm_put(i915);

	return err;
}