/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OF THE
 * SOFTWARE.
 *
 */

#include <linux/kthread.h>

#include "../i915_selftest.h"
#include "i915_random.h"
#include "igt_flush_test.h"
#include "igt_wedge_me.h"

#include "mock_context.h"
#include "mock_drm.h"

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */

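/*
 * The hang fixture: a kernel context plus two page-sized objects. h->hws is
 * a CPU-visible status page into which each hanging batch writes its seqno
 * (one u32 slot per fence context), and h->obj holds the batch itself, which
 * spins in place until it is rewritten or the engine is reset.
 */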
struct hang {
	struct drm_i915_private *i915;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	struct i915_gem_context *ctx;
	u32 *seqno;
	u32 *batch;
};

static int hang_init(struct hang *h, struct drm_i915_private *i915)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->i915 = i915;

	h->ctx = kernel_context(i915);
	if (IS_ERR(h->ctx))
		return PTR_ERR(h->ctx);

	h->hws = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(h->hws)) {
		err = PTR_ERR(h->hws);
		goto err_ctx;
	}

	h->obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_level(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map(h->obj,
					HAS_LLC(i915) ? I915_MAP_WB : I915_MAP_WC);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
err_ctx:
	kernel_context_close(h->ctx);
	return err;
}

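/*
 * Each fence context owns one u32 slot in the HWS page: hws_address()
 * computes the GPU address the batch writes to, and hws_seqno() reads the
 * same slot back on the CPU, so both reduce the context id modulo
 * PAGE_SIZE / sizeof(u32).
 */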
static u64 hws_address(const struct i915_vma *hws,
		       const struct i915_request *rq)
{
	return hws->node.start + offset_in_page(sizeof(u32) * rq->fence.context);
}

static int emit_recurse_batch(struct hang *h,
			      struct i915_request *rq)
{
	struct drm_i915_private *i915 = h->i915;
	struct i915_address_space *vm =
		rq->gem_context->ppgtt ?
		&rq->gem_context->ppgtt->vm :
		&i915->ggtt.vm;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	u32 *batch;
	int err;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws))
		return PTR_ERR(hws);

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err)
		return err;

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	err = i915_vma_move_to_active(vma, rq, 0);
	if (err)
		goto unpin_hws;

	if (!i915_gem_object_has_active_reference(vma->obj)) {
		i915_gem_object_get(vma->obj);
		i915_gem_object_set_active_reference(vma->obj);
	}

	err = i915_vma_move_to_active(hws, rq, 0);
	if (err)
		goto unpin_hws;

	if (!i915_gem_object_has_active_reference(hws->obj)) {
		i915_gem_object_get(hws->obj);
		i915_gem_object_set_active_reference(hws->obj);
	}

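	/*
	 * The batch emitted below (gen8+ variant shown) advertises that the
	 * request has started by storing rq->fence.seqno into its HWS slot,
	 * then spins forever by jumping back to its own start:
	 *
	 *   MI_STORE_DWORD_IMM    hws[ctx] = rq->fence.seqno
	 *   MI_ARB_CHECK / <padding> / MI_ARB_CHECK
	 *   MI_BATCH_BUFFER_START -> loop back to the top
	 *
	 * The MI_ARB_CHECKs leave preemption points, and rewriting the first
	 * dword to MI_BATCH_BUFFER_END (see hang_fini) breaks the loop.
	 */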
	batch = h->batch;
	if (INTEL_GEN(i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (INTEL_GEN(i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (INTEL_GEN(i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached! */
	i915_gem_chipset_flush(h->i915);

	flags = 0;
	if (INTEL_GEN(vm->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

unpin_hws:
	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	return err;
}

static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
	struct i915_request *rq;
	int err;

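	/*
	 * If the previous hanging batch is still busy (e.g. it was left
	 * spinning and has only just been reset), we cannot rewrite it in
	 * place without corrupting the in-flight batch, so swap in a freshly
	 * allocated batch object first.
	 */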
	if (i915_gem_object_is_active(h->obj)) {
		struct drm_i915_gem_object *obj;
		void *vaddr;

		obj = i915_gem_object_create_internal(h->i915, PAGE_SIZE);
		if (IS_ERR(obj))
			return ERR_CAST(obj);

		vaddr = i915_gem_object_pin_map(obj,
						HAS_LLC(h->i915) ? I915_MAP_WB : I915_MAP_WC);
		if (IS_ERR(vaddr)) {
			i915_gem_object_put(obj);
			return ERR_CAST(vaddr);
		}

		i915_gem_object_unpin_map(h->obj);
		i915_gem_object_put(h->obj);

		h->obj = obj;
		h->batch = vaddr;
	}

	rq = i915_request_alloc(engine, h->ctx);
	if (IS_ERR(rq))
		return rq;

	err = emit_recurse_batch(h, rq);
	if (err) {
		i915_request_add(rq);
		return ERR_PTR(err);
	}

	return rq;
}

static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	i915_gem_chipset_flush(h->i915);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	kernel_context_close(h->ctx);

	igt_flush_test(h->i915, I915_WAIT_LOCKED);
}

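/*
 * A request is "running" once its batch has written the seqno to the HWS
 * page: poll quickly for 10us in case it is already executing, then fall
 * back to a slower 1s wait for it to be scheduled in.
 */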
static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

static int igt_hang_sanitycheck(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct i915_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	for_each_engine(engine, i915, id) {
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_request_get(rq);

		*h.batch = MI_BATCH_BUFFER_END;
		i915_gem_chipset_flush(i915);

		i915_request_add(rq);

		timeout = i915_request_wait(rq,
					    I915_WAIT_LOCKED,
					    MAX_SCHEDULE_TIMEOUT);
		i915_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	return err;
}

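/*
 * Resets are serialized through gpu_error.flags: take I915_RESET_BACKOFF and
 * every per-engine reset bit so that no real reset can run concurrently with
 * the test, waiting out any reset already in flight.
 */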
static void global_reset_lock(struct drm_i915_private *i915)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	pr_debug("%s: current gpu_error=%08lx\n",
		 __func__, i915->gpu_error.flags);

	while (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags))
		wait_event(i915->gpu_error.reset_queue,
			   !test_bit(I915_RESET_BACKOFF,
				     &i915->gpu_error.flags));

	for_each_engine(engine, i915, id) {
		while (test_and_set_bit(I915_RESET_ENGINE + id,
					&i915->gpu_error.flags))
			wait_on_bit(&i915->gpu_error.flags,
				    I915_RESET_ENGINE + id,
				    TASK_UNINTERRUPTIBLE);
	}
}

static void global_reset_unlock(struct drm_i915_private *i915)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, i915, id)
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);

	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
	wake_up_all(&i915->gpu_error.reset_queue);
}

static int igt_global_reset(void *arg)
{
	struct drm_i915_private *i915 = arg;
	unsigned int reset_count;
	int err = 0;

	/* Check that we can issue a global GPU reset */

	global_reset_lock(i915);
	set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);

	mutex_lock(&i915->drm.struct_mutex);
	reset_count = i915_reset_count(&i915->gpu_error);

	i915_reset(i915, ALL_ENGINES, NULL);

	if (i915_reset_count(&i915->gpu_error) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
	}
	mutex_unlock(&i915->drm.struct_mutex);

	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		err = -EIO;

	return err;
}

static bool wait_for_idle(struct intel_engine_cs *engine)
{
	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}

static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err = 0;

	/* Check that we can issue an engine reset on an idle engine (no-op) */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (active) {
		mutex_lock(&i915->drm.struct_mutex);
		err = hang_init(&h, i915);
		mutex_unlock(&i915->drm.struct_mutex);
		if (err)
			return err;
	}

	for_each_engine(engine, i915, id) {
		unsigned int reset_count, reset_engine_count;
		IGT_TIMEOUT(end_time);

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("%s failed to idle before reset\n",
			       engine->name);
			err = -EIO;
			break;
		}

		reset_count = i915_reset_count(&i915->gpu_error);
		reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
							     engine);

		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		do {
			u32 seqno = intel_engine_get_seqno(engine);

			if (active) {
				struct i915_request *rq;

				mutex_lock(&i915->drm.struct_mutex);
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					mutex_unlock(&i915->drm.struct_mutex);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);
				mutex_unlock(&i915->drm.struct_mutex);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(i915->drm.dev);

					pr_err("%s: Failed to start request %x, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}

				GEM_BUG_ON(!rq->global_seqno);
				seqno = rq->global_seqno - 1;
				i915_request_put(rq);
			}

			err = i915_reset_engine(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(&i915->gpu_error) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			reset_engine_count += active;
			if (i915_reset_engine_count(&i915->gpu_error, engine) !=
			    reset_engine_count) {
				pr_err("%s engine reset %srecorded!\n",
				       engine->name, active ? "not " : "");
				err = -EINVAL;
				break;
			}

			if (!wait_for_idle(engine)) {
				struct drm_printer p =
					drm_info_printer(i915->drm.dev);

				pr_err("%s failed to idle after reset\n",
				       engine->name);
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				err = -EIO;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);

		if (err)
			break;

		err = igt_flush_test(i915, 0);
		if (err)
			break;
	}

	if (i915_terminally_wedged(&i915->gpu_error))
		err = -EIO;

	if (active) {
		mutex_lock(&i915->drm.struct_mutex);
		hang_fini(&h);
		mutex_unlock(&i915->drm.struct_mutex);
	}

	return err;
}

static int igt_reset_idle_engine(void *arg)
{
	return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
	return __igt_reset_engine(arg, true);
}

struct active_engine {
	struct task_struct *task;
	struct intel_engine_cs *engine;
	unsigned long resets;
	unsigned int flags;
};

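/*
 * Flags selecting the __igt_reset_engines() phases: TEST_ACTIVE hangs the
 * engine under test, TEST_OTHERS spawns background threads on the other
 * engines, TEST_SELF additionally loads the engine being reset, and
 * TEST_PRIORITY randomizes the priority of the background requests.
 */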
#define TEST_ACTIVE	BIT(0)
#define TEST_OTHERS	BIT(1)
#define TEST_SELF	BIT(2)
#define TEST_PRIORITY	BIT(3)

static int active_request_put(struct i915_request *rq)
{
	int err = 0;

	if (!rq)
		return 0;

	if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
		GEM_TRACE("%s timed out waiting for completion of fence %llx:%d, seqno %d.\n",
			  rq->engine->name,
			  rq->fence.context,
			  rq->fence.seqno,
			  i915_request_global_seqno(rq));
		GEM_TRACE_DUMP();

		i915_gem_set_wedged(rq->i915);
		err = -EIO;
	}

	i915_request_put(rq);

	return err;
}

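/*
 * Background load: each thread keeps a ring of 8 in-flight requests on its
 * engine, retiring the oldest before queuing a new one, until told to stop.
 */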
static int active_engine(void *data)
{
	I915_RND_STATE(prng);
	struct active_engine *arg = data;
	struct intel_engine_cs *engine = arg->engine;
	struct i915_request *rq[8] = {};
	struct i915_gem_context *ctx[ARRAY_SIZE(rq)];
	struct drm_file *file;
	unsigned long count = 0;
	int err = 0;

	file = mock_file(engine->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	for (count = 0; count < ARRAY_SIZE(ctx); count++) {
		mutex_lock(&engine->i915->drm.struct_mutex);
		ctx[count] = live_context(engine->i915, file);
		mutex_unlock(&engine->i915->drm.struct_mutex);
		if (IS_ERR(ctx[count])) {
			err = PTR_ERR(ctx[count]);
			while (--count)
				i915_gem_context_put(ctx[count]);
			goto err_file;
		}
	}

	while (!kthread_should_stop()) {
		unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
		struct i915_request *old = rq[idx];
		struct i915_request *new;

		mutex_lock(&engine->i915->drm.struct_mutex);
		new = i915_request_alloc(engine, ctx[idx]);
		if (IS_ERR(new)) {
			mutex_unlock(&engine->i915->drm.struct_mutex);
			err = PTR_ERR(new);
			break;
		}

		if (arg->flags & TEST_PRIORITY)
			ctx[idx]->sched.priority =
				i915_prandom_u32_max_state(512, &prng);

		rq[idx] = i915_request_get(new);
		i915_request_add(new);
		mutex_unlock(&engine->i915->drm.struct_mutex);

		err = active_request_put(old);
		if (err)
			break;

		cond_resched();
	}

	for (count = 0; count < ARRAY_SIZE(rq); count++) {
		int err__ = active_request_put(rq[count]);

		/* Keep the first error */
		if (!err)
			err = err__;
	}

err_file:
	mock_file_free(engine->i915, file);
	return err;
}

static int __igt_reset_engines(struct drm_i915_private *i915,
			       const char *test_name,
			       unsigned int flags)
{
	struct intel_engine_cs *engine, *other;
	enum intel_engine_id id, tmp;
	struct hang h;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
	 * with any other engine.
	 */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (flags & TEST_ACTIVE) {
		mutex_lock(&i915->drm.struct_mutex);
		err = hang_init(&h, i915);
		mutex_unlock(&i915->drm.struct_mutex);
		if (err)
			return err;

		if (flags & TEST_PRIORITY)
			h.ctx->sched.priority = 1024;
	}

	for_each_engine(engine, i915, id) {
		struct active_engine threads[I915_NUM_ENGINES] = {};
		unsigned long global = i915_reset_count(&i915->gpu_error);
		unsigned long count = 0, reported;
		IGT_TIMEOUT(end_time);

		if (flags & TEST_ACTIVE &&
		    !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
			       engine->name, test_name);
			err = -EIO;
			break;
		}

		memset(threads, 0, sizeof(threads));
		for_each_engine(other, i915, tmp) {
			struct task_struct *tsk;

			threads[tmp].resets =
				i915_reset_engine_count(&i915->gpu_error,
							other);

			if (!(flags & TEST_OTHERS))
				continue;

			if (other == engine && !(flags & TEST_SELF))
				continue;

			threads[tmp].engine = other;
			threads[tmp].flags = flags;

			tsk = kthread_run(active_engine, &threads[tmp],
					  "igt/%s", other->name);
			if (IS_ERR(tsk)) {
				err = PTR_ERR(tsk);
				goto unwind;
			}

			threads[tmp].task = tsk;
			get_task_struct(tsk);
		}

		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		do {
			u32 seqno = intel_engine_get_seqno(engine);
			struct i915_request *rq = NULL;

			if (flags & TEST_ACTIVE) {
				mutex_lock(&i915->drm.struct_mutex);
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					mutex_unlock(&i915->drm.struct_mutex);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);
				mutex_unlock(&i915->drm.struct_mutex);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(i915->drm.dev);

					pr_err("%s: Failed to start request %x, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}

				GEM_BUG_ON(!rq->global_seqno);
				seqno = rq->global_seqno - 1;
			}

			err = i915_reset_engine(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
				       engine->name, test_name, err);
				break;
			}

			count++;

			if (rq) {
				i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
				i915_request_put(rq);
			}

			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
				struct drm_printer p =
					drm_info_printer(i915->drm.dev);

				pr_err("i915_reset_engine(%s:%s):"
				       " failed to idle after reset\n",
				       engine->name, test_name);
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				err = -EIO;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
			engine->name, test_name, count);

		reported = i915_reset_engine_count(&i915->gpu_error, engine);
		reported -= threads[engine->id].resets;
		if (reported != (flags & TEST_ACTIVE ? count : 0)) {
			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu, expected %lu reported\n",
			       engine->name, test_name, count, reported,
			       (flags & TEST_ACTIVE ? count : 0));
			if (!err)
				err = -EINVAL;
		}

unwind:
		for_each_engine(other, i915, tmp) {
			int ret;

			if (!threads[tmp].task)
				continue;

			ret = kthread_stop(threads[tmp].task);
			if (ret) {
				pr_err("kthread for other engine %s failed, err=%d\n",
				       other->name, ret);
				if (!err)
					err = ret;
			}
			put_task_struct(threads[tmp].task);

			if (other != engine &&
			    threads[tmp].resets !=
			    i915_reset_engine_count(&i915->gpu_error, other)) {
				pr_err("Innocent engine %s was reset (count=%ld)\n",
				       other->name,
				       i915_reset_engine_count(&i915->gpu_error,
							       other) -
				       threads[tmp].resets);
				if (!err)
					err = -EINVAL;
			}
		}

		if (global != i915_reset_count(&i915->gpu_error)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(&i915->gpu_error) - global);
			if (!err)
				err = -EINVAL;
		}

		if (err)
			break;

		err = igt_flush_test(i915, 0);
		if (err)
			break;
	}

	if (i915_terminally_wedged(&i915->gpu_error))
		err = -EIO;

	if (flags & TEST_ACTIVE) {
		mutex_lock(&i915->drm.struct_mutex);
		hang_fini(&h);
		mutex_unlock(&i915->drm.struct_mutex);
	}

	return err;
}

static int igt_reset_engines(void *arg)
{
	static const struct {
		const char *name;
		unsigned int flags;
	} phases[] = {
		{ "idle", 0 },
		{ "active", TEST_ACTIVE },
		{ "others-idle", TEST_OTHERS },
		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
		{
			"others-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
		},
		{
			"self-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
		},
		{ }
	};
	struct drm_i915_private *i915 = arg;
	typeof(*phases) *p;
	int err;

	for (p = phases; p->name; p++) {
		if (p->flags & TEST_PRIORITY) {
			if (!(i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
				continue;
		}

		err = __igt_reset_engines(arg, p->name, p->flags);
		if (err)
			return err;
	}

	return 0;
}

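/*
 * Simulate hangcheck declaring the request stuck: mark the stalled engines
 * and raise I915_RESET_HANDOFF so that a waiter picks up the reset, since
 * the real hangcheck worker is disabled for the duration of the selftests.
 */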
static u32 fake_hangcheck(struct i915_request *rq, u32 mask)
{
	struct i915_gpu_error *error = &rq->i915->gpu_error;
	u32 reset_count = i915_reset_count(error);

	error->stalled_mask = mask;

	/* set_bit() must be after we have setup the backchannel (mask) */
	smp_mb__before_atomic();
	set_bit(I915_RESET_HANDOFF, &error->flags);

	wake_up_all(&error->wait_queue);

	return reset_count;
}

static int igt_reset_wait(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct i915_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	if (!intel_engine_can_store_dword(i915->engine[RCS]))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	global_reset_lock(i915);

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	rq = hang_create_request(&h, i915->engine[RCS]);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("%s: Failed to start request %x, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_gem_set_wedged(i915);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(rq, ALL_ENGINES);

	timeout = i915_request_wait(rq, I915_WAIT_LOCKED, 10);
	if (timeout < 0) {
		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
	if (i915_reset_count(&i915->gpu_error) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		return -EIO;

	return err;
}

struct evict_vma {
	struct completion completion;
	struct i915_vma *vma;
};

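/*
 * Runs in its own kthread: signal that we have started, then try to evict
 * the target node while its vma is still pinned by the hanging request. The
 * eviction must block until the reset breaks the hang.
 */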
static int evict_vma(void *data)
{
	struct evict_vma *arg = data;
	struct i915_address_space *vm = arg->vma->vm;
	struct drm_i915_private *i915 = vm->i915;
	struct drm_mm_node evict = arg->vma->node;
	int err;

	complete(&arg->completion);

	mutex_lock(&i915->drm.struct_mutex);
	err = i915_gem_evict_for_node(vm, &evict, 0);
	mutex_unlock(&i915->drm.struct_mutex);

	return err;
}

static int evict_fence(void *data)
{
	struct evict_vma *arg = data;
	struct drm_i915_private *i915 = arg->vma->vm->i915;
	int err;

	complete(&arg->completion);

	mutex_lock(&i915->drm.struct_mutex);

	/* Mark the fence register as dirty to force the mmio update. */
	err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
	if (err) {
		pr_err("Invalid Y-tiling settings; err:%d\n", err);
		goto out_unlock;
	}

	err = i915_vma_pin_fence(arg->vma);
	if (err) {
		pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
		goto out_unlock;
	}

	i915_vma_unpin_fence(arg->vma);

out_unlock:
	mutex_unlock(&i915->drm.struct_mutex);

	return err;
}

static int __igt_reset_evict_vma(struct drm_i915_private *i915,
				 struct i915_address_space *vm,
				 int (*fn)(void *),
				 unsigned int flags)
{
	struct drm_i915_gem_object *obj;
	struct task_struct *tsk = NULL;
	struct i915_request *rq;
	struct evict_vma arg;
	struct hang h;
	int err;

	if (!intel_engine_can_store_dword(i915->engine[RCS]))
		return 0;

	/* Check that we can recover an unbind stuck on a hanging request */

	global_reset_lock(i915);

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	obj = i915_gem_object_create_internal(i915, SZ_1M);
	if (IS_ERR(obj)) {
		err = PTR_ERR(obj);
		goto fini;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
		if (err) {
			pr_err("Invalid X-tiling settings; err:%d\n", err);
			goto out_obj;
		}
	}

	arg.vma = i915_vma_instance(obj, vm, NULL);
	if (IS_ERR(arg.vma)) {
		err = PTR_ERR(arg.vma);
		goto out_obj;
	}

	rq = hang_create_request(&h, i915->engine[RCS]);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_obj;
	}

	err = i915_vma_pin(arg.vma, 0, 0,
			   i915_vma_is_ggtt(arg.vma) ?
			   PIN_GLOBAL | PIN_MAPPABLE :
			   PIN_USER);
	if (err) {
		i915_request_add(rq);
		goto out_obj;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_vma_pin_fence(arg.vma);
		if (err) {
			pr_err("Unable to pin X-tiled fence; err:%d\n", err);
			i915_vma_unpin(arg.vma);
			i915_request_add(rq);
			goto out_obj;
		}
	}

	err = i915_vma_move_to_active(arg.vma, rq, flags);

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		i915_vma_unpin_fence(arg.vma);
	i915_vma_unpin(arg.vma);

	i915_request_get(rq);
	i915_request_add(rq);
	if (err)
		goto out_rq;

	mutex_unlock(&i915->drm.struct_mutex);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("%s: Failed to start request %x, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_gem_set_wedged(i915);
		goto out_reset;
	}

	init_completion(&arg.completion);

	tsk = kthread_run(fn, &arg, "igt/evict_vma");
	if (IS_ERR(tsk)) {
		err = PTR_ERR(tsk);
		tsk = NULL;
		goto out_reset;
	}

	wait_for_completion(&arg.completion);

	if (wait_for(waitqueue_active(&rq->execute), 10)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("igt/evict_vma kthread did not wait\n");
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_gem_set_wedged(i915);
		goto out_reset;
	}

out_reset:
	fake_hangcheck(rq, intel_engine_flag(rq->engine));

	if (tsk) {
		struct igt_wedge_me w;

		/* The reset, even indirectly, should take less than 10ms. */
		igt_wedge_on_timeout(&w, i915, HZ / 10 /* 100ms timeout */)
			err = kthread_stop(tsk);
	}

	mutex_lock(&i915->drm.struct_mutex);
out_rq:
	i915_request_put(rq);
out_obj:
	i915_gem_object_put(obj);
fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		return -EIO;

	return err;
}

static int igt_reset_evict_ggtt(void *arg)
{
	struct drm_i915_private *i915 = arg;

	return __igt_reset_evict_vma(i915, &i915->ggtt.vm,
				     evict_vma, EXEC_OBJECT_WRITE);
}

static int igt_reset_evict_ppgtt(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct i915_gem_context *ctx;
	struct drm_file *file;
	int err;

	file = mock_file(i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	mutex_lock(&i915->drm.struct_mutex);
	ctx = live_context(i915, file);
	mutex_unlock(&i915->drm.struct_mutex);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto out;
	}

	err = 0;
	if (ctx->ppgtt) /* aliasing == global gtt locking, covered above */
		err = __igt_reset_evict_vma(i915, &ctx->ppgtt->vm,
					    evict_vma, EXEC_OBJECT_WRITE);

out:
	mock_file_free(i915, file);
	return err;
}

static int igt_reset_evict_fence(void *arg)
{
	struct drm_i915_private *i915 = arg;

	return __igt_reset_evict_vma(i915, &i915->ggtt.vm,
				     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
}

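/*
 * Following a device reset, wait for every engine other than the one being
 * exercised to idle again before queuing the next hanging request.
 */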
static int wait_for_others(struct drm_i915_private *i915,
			   struct intel_engine_cs *exclude)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, i915, id) {
		if (engine == exclude)
			continue;

		if (!wait_for_idle(engine))
			return -EIO;
	}

	return 0;
}

static int igt_reset_queue(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	global_reset_lock(i915);

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	for_each_engine(engine, i915, id) {
		struct i915_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;

		if (!intel_engine_can_store_dword(engine))
			continue;

		prev = hang_create_request(&h, engine);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			goto fini;
		}

		i915_request_get(prev);
		i915_request_add(prev);

		count = 0;
		do {
			struct i915_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h, engine);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto fini;
			}

			i915_request_get(rq);
			i915_request_add(rq);

			/*
			 * XXX We don't handle resetting the kernel context
			 * very well. If we trigger a device reset twice in
			 * quick succession while the kernel context is
			 * executing, we may end up skipping the breadcrumb.
			 * This is really only a problem for the selftest as
			 * normally there is a large interlude between resets
			 * (hangcheck), or we focus on resetting just one
			 * engine and so avoid repeatedly resetting innocents.
			 */
			err = wait_for_others(i915, engine);
			if (err) {
				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
				       __func__, engine->name);
				i915_request_put(rq);
				i915_request_put(prev);

				GEM_TRACE_DUMP();
				i915_gem_set_wedged(i915);
				goto fini;
			}

			if (!wait_until_running(&h, prev)) {
				struct drm_printer p = drm_info_printer(i915->drm.dev);

				pr_err("%s(%s): Failed to start request %x, at %x\n",
				       __func__, engine->name,
				       prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				i915_request_put(rq);
				i915_request_put(prev);

				i915_gem_set_wedged(i915);

				err = -EIO;
				goto fini;
			}

			reset_count = fake_hangcheck(prev, ENGINE_MASK(id));

			i915_reset(i915, ENGINE_MASK(id), NULL);

			GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
					    &i915->gpu_error.flags));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (i915_reset_count(&i915->gpu_error) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			i915_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d resets\n", engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		i915_gem_chipset_flush(i915);

		i915_request_put(prev);

		err = igt_flush_test(i915, I915_WAIT_LOCKED);
		if (err)
			break;
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	global_reset_unlock(i915);

	if (i915_terminally_wedged(&i915->gpu_error))
		return -EIO;

	return err;
}

static int igt_handle_error(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine = i915->engine[RCS];
	struct hang h;
	struct i915_request *rq;
	struct i915_gpu_state *error;
	int err;

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	mutex_lock(&i915->drm.struct_mutex);

	err = hang_init(&h, i915);
	if (err)
		goto err_unlock;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto err_fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("%s: Failed to start request %x, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_gem_set_wedged(i915);

		err = -EIO;
		goto err_request;
	}

	mutex_unlock(&i915->drm.struct_mutex);

	/* Temporarily disable error capture */
	error = xchg(&i915->gpu_error.first_error, (void *)-1);

	i915_handle_error(i915, ENGINE_MASK(engine->id), 0, NULL);

	xchg(&i915->gpu_error.first_error, error);

	mutex_lock(&i915->drm.struct_mutex);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_request_put(rq);
err_fini:
	hang_fini(&h);
err_unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	return err;
}

int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_global_reset), /* attempt to recover GPU first */
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
		SUBTEST(igt_reset_engines),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_reset_wait),
		SUBTEST(igt_reset_evict_ggtt),
		SUBTEST(igt_reset_evict_ppgtt),
		SUBTEST(igt_reset_evict_fence),
		SUBTEST(igt_handle_error),
	};
	bool saved_hangcheck;
	int err;

	if (!intel_has_gpu_reset(i915))
		return 0;

	if (i915_terminally_wedged(&i915->gpu_error))
		return -EIO; /* we're long past hope of a successful reset */

	intel_runtime_pm_get(i915);
	saved_hangcheck = fetch_and_zero(&i915_modparams.enable_hangcheck);

	err = i915_subtests(tests, i915);

	mutex_lock(&i915->drm.struct_mutex);
	igt_flush_test(i915, I915_WAIT_LOCKED);
	mutex_unlock(&i915->drm.struct_mutex);

	i915_modparams.enable_hangcheck = saved_hangcheck;
	intel_runtime_pm_put(i915);

	return err;
}