#include <linux/kthread.h>

#include "../i915_selftest.h"
#include "i915_random.h"
#include "igt_flush_test.h"
#include "igt_reset.h"
#include "igt_wedge_me.h"

#include "mock_context.h"
#include "mock_drm.h"

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait for the engine to settle */

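/*
 * A "hang" is a batch buffer that writes its seqno to the hws page and then
 * spins in place (MI_BATCH_BUFFER_START looping back on itself) until the
 * batch is rewritten with MI_BATCH_BUFFER_END or the engine is reset. The
 * struct below bundles the context, the seqno page (hws) and the batch
 * object shared by the reset tests.
 */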
struct hang {
	struct drm_i915_private *i915;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	struct i915_gem_context *ctx;
	u32 *seqno;
	u32 *batch;
};

static int hang_init(struct hang *h, struct drm_i915_private *i915)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->i915 = i915;

	h->ctx = kernel_context(i915);
	if (IS_ERR(h->ctx))
		return PTR_ERR(h->ctx);

	GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));

	h->hws = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(h->hws)) {
		err = PTR_ERR(h->hws);
		goto err_ctx;
	}

	h->obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map(h->obj,
					i915_coherent_map_type(i915));
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
err_ctx:
	kernel_context_close(h->ctx);
	return err;
}

static u64 hws_address(const struct i915_vma *hws,
		       const struct i915_request *rq)
{
	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}

static int move_to_active(struct i915_vma *vma,
			  struct i915_request *rq,
			  unsigned int flags)
{
	int err;

	err = i915_vma_move_to_active(vma, rq, flags);
	if (err)
		return err;

	if (!i915_gem_object_has_active_reference(vma->obj)) {
		i915_gem_object_get(vma->obj);
		i915_gem_object_set_active_reference(vma->obj);
	}

	return 0;
}

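/*
 * Build a request whose batch stores the request's seqno into the hws page
 * (so wait_until_running() can tell when it has started executing) and then
 * jumps back to its own start, spinning forever. The MI_ARB_CHECK before the
 * loop provides an arbitration point so that a preemption or engine reset
 * can interrupt the spinner.
 */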
static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
	struct drm_i915_private *i915 = h->i915;
	struct i915_address_space *vm =
		h->ctx->ppgtt ? &h->ctx->ppgtt->vm : &i915->ggtt.vm;
	struct i915_request *rq = NULL;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	u32 *batch;
	int err;

	if (i915_gem_object_is_active(h->obj)) {
		struct drm_i915_gem_object *obj;
		void *vaddr;

		obj = i915_gem_object_create_internal(h->i915, PAGE_SIZE);
		if (IS_ERR(obj))
			return ERR_CAST(obj);

		vaddr = i915_gem_object_pin_map(obj,
						i915_coherent_map_type(h->i915));
		if (IS_ERR(vaddr)) {
			i915_gem_object_put(obj);
			return ERR_CAST(vaddr);
		}

		i915_gem_object_unpin_map(h->obj);
		i915_gem_object_put(h->obj);

		h->obj = obj;
		h->batch = vaddr;
	}

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma))
		return ERR_CAST(vma);

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws))
		return ERR_CAST(hws);

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err)
		return ERR_PTR(err);

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	rq = i915_request_alloc(engine, h->ctx);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto unpin_hws;
	}

	err = move_to_active(vma, rq, 0);
	if (err)
		goto cancel_rq;

	err = move_to_active(hws, rq, 0);
	if (err)
		goto cancel_rq;

	batch = h->batch;
	if (INTEL_GEN(i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (INTEL_GEN(i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (INTEL_GEN(i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END;
	i915_gem_chipset_flush(h->i915);

	if (rq->engine->emit_init_breadcrumb) {
		err = rq->engine->emit_init_breadcrumb(rq);
		if (err)
			goto cancel_rq;
	}

	flags = 0;
	if (INTEL_GEN(vm->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

cancel_rq:
	if (err) {
		i915_request_skip(rq, err);
		i915_request_add(rq);
	}
unpin_hws:
	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	return err ? ERR_PTR(err) : rq;
}

static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	i915_gem_chipset_flush(h->i915);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	kernel_context_close(h->ctx);

	igt_flush_test(h->i915, I915_WAIT_LOCKED);
}

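/*
 * The spinner advertises that it is running by writing its seqno into the
 * hws page; poll for that first with a short busy-wait and then with a
 * longer sleeping wait before declaring that the request failed to start.
 */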
static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

static int igt_hang_sanitycheck(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct i915_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	for_each_engine(engine, i915, id) {
		struct igt_wedge_me w;
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_request_get(rq);

		*h.batch = MI_BATCH_BUFFER_END;
		i915_gem_chipset_flush(i915);

		i915_request_add(rq);

		timeout = 0;
		igt_wedge_on_timeout(&w, i915, HZ / 10 /* 100ms */)
			timeout = i915_request_wait(rq,
						    I915_WAIT_LOCKED,
						    MAX_SCHEDULE_TIMEOUT);
		if (i915_reset_failed(i915))
			timeout = -EIO;

		i915_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	return err;
}

static int igt_global_reset(void *arg)
{
	struct drm_i915_private *i915 = arg;
	unsigned int reset_count;
	int err = 0;

	/* Check that we can issue a global GPU reset */

	igt_global_reset_lock(i915);

	reset_count = i915_reset_count(&i915->gpu_error);

	i915_reset(i915, ALL_ENGINES, NULL);

	if (i915_reset_count(&i915->gpu_error) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
	}

	igt_global_reset_unlock(i915);

	if (i915_reset_failed(i915))
		err = -EIO;

	return err;
}

static int igt_wedged_reset(void *arg)
{
	struct drm_i915_private *i915 = arg;
	intel_wakeref_t wakeref;

	/* Check that we can recover a wedged device with a GPU reset */

	igt_global_reset_lock(i915);
	wakeref = intel_runtime_pm_get(i915);

	i915_gem_set_wedged(i915);

	GEM_BUG_ON(!i915_reset_failed(i915));
	i915_reset(i915, ALL_ENGINES, NULL);

	intel_runtime_pm_put(i915, wakeref);
	igt_global_reset_unlock(i915);

	return i915_reset_failed(i915) ? -EIO : 0;
}

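/* Wait up to IGT_IDLE_TIMEOUT (ms) for the engine to settle after a reset. */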
static bool wait_for_idle(struct intel_engine_cs *engine)
{
	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}

static int igt_reset_nop(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	struct i915_gem_context *ctx;
	unsigned int reset_count, count;
	enum intel_engine_id id;
	intel_wakeref_t wakeref;
	struct drm_file *file;
	IGT_TIMEOUT(end_time);
	int err = 0;

	/* Check that we can reset during non-user portions of requests */

	file = mock_file(i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	mutex_lock(&i915->drm.struct_mutex);
	ctx = live_context(i915, file);
	mutex_unlock(&i915->drm.struct_mutex);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto out;
	}

	i915_gem_context_clear_bannable(ctx);
	wakeref = intel_runtime_pm_get(i915);
	reset_count = i915_reset_count(&i915->gpu_error);
	count = 0;
	do {
		mutex_lock(&i915->drm.struct_mutex);
		for_each_engine(engine, i915, id) {
			int i;

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = i915_request_alloc(engine, ctx);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
		}
		mutex_unlock(&i915->drm.struct_mutex);

		igt_global_reset_lock(i915);
		i915_reset(i915, ALL_ENGINES, NULL);
		igt_global_reset_unlock(i915);
		if (i915_reset_failed(i915)) {
			err = -EIO;
			break;
		}

		if (i915_reset_count(&i915->gpu_error) !=
		    reset_count + ++count) {
			pr_err("Full GPU reset not recorded!\n");
			err = -EINVAL;
			break;
		}

		if (!i915_reset_flush(i915)) {
			struct drm_printer p =
				drm_info_printer(i915->drm.dev);

			pr_err("%s failed to idle after reset\n",
			       engine->name);
			intel_engine_dump(engine, &p,
					  "%s\n", engine->name);

			err = -EIO;
			break;
		}

		err = igt_flush_test(i915, 0);
		if (err)
			break;
	} while (time_before(jiffies, end_time));
	pr_info("%s: %d resets\n", __func__, count);

	mutex_lock(&i915->drm.struct_mutex);
	err = igt_flush_test(i915, I915_WAIT_LOCKED);
	mutex_unlock(&i915->drm.struct_mutex);

	intel_runtime_pm_put(i915, wakeref);

out:
	mock_file_free(i915, file);
	if (i915_reset_failed(i915))
		err = -EIO;
	return err;
}

static int igt_reset_nop_engine(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	struct i915_gem_context *ctx;
	enum intel_engine_id id;
	intel_wakeref_t wakeref;
	struct drm_file *file;
	int err = 0;

	/* Check that we can engine-reset during non-user portions */

	if (!intel_has_reset_engine(i915))
		return 0;

	file = mock_file(i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	mutex_lock(&i915->drm.struct_mutex);
	ctx = live_context(i915, file);
	mutex_unlock(&i915->drm.struct_mutex);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto out;
	}

	i915_gem_context_clear_bannable(ctx);
	wakeref = intel_runtime_pm_get(i915);
	for_each_engine(engine, i915, id) {
		unsigned int reset_count, reset_engine_count;
		unsigned int count;
		IGT_TIMEOUT(end_time);

		reset_count = i915_reset_count(&i915->gpu_error);
		reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
							     engine);
		count = 0;

		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		do {
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			mutex_lock(&i915->drm.struct_mutex);
			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = i915_request_alloc(engine, ctx);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
			mutex_unlock(&i915->drm.struct_mutex);

			err = i915_reset_engine(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(&i915->gpu_error) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(&i915->gpu_error, engine) !=
			    reset_engine_count + ++count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}

			if (!i915_reset_flush(i915)) {
				struct drm_printer p =
					drm_info_printer(i915->drm.dev);

				pr_err("%s failed to idle after reset\n",
				       engine->name);
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				err = -EIO;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);

		if (err)
			break;

		err = igt_flush_test(i915, 0);
		if (err)
			break;
	}

	mutex_lock(&i915->drm.struct_mutex);
	err = igt_flush_test(i915, I915_WAIT_LOCKED);
	mutex_unlock(&i915->drm.struct_mutex);

	intel_runtime_pm_put(i915, wakeref);
out:
	mock_file_free(i915, file);
	if (i915_reset_failed(i915))
		err = -EIO;
	return err;
}

static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err = 0;

	/* Check that we can issue an engine reset on an idle engine (no-op) */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (active) {
		mutex_lock(&i915->drm.struct_mutex);
		err = hang_init(&h, i915);
		mutex_unlock(&i915->drm.struct_mutex);
		if (err)
			return err;
	}

	for_each_engine(engine, i915, id) {
		unsigned int reset_count, reset_engine_count;
		IGT_TIMEOUT(end_time);

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("%s failed to idle before reset\n",
			       engine->name);
			err = -EIO;
			break;
		}

		reset_count = i915_reset_count(&i915->gpu_error);
		reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
							     engine);

		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		do {
			if (active) {
				struct i915_request *rq;

				mutex_lock(&i915->drm.struct_mutex);
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					mutex_unlock(&i915->drm.struct_mutex);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);
				mutex_unlock(&i915->drm.struct_mutex);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			err = i915_reset_engine(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(&i915->gpu_error) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(&i915->gpu_error, engine) !=
			    ++reset_engine_count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}

			if (!i915_reset_flush(i915)) {
				struct drm_printer p =
					drm_info_printer(i915->drm.dev);

				pr_err("%s failed to idle after reset\n",
				       engine->name);
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				err = -EIO;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);

		if (err)
			break;

		err = igt_flush_test(i915, 0);
		if (err)
			break;
	}

	if (i915_reset_failed(i915))
		err = -EIO;

	if (active) {
		mutex_lock(&i915->drm.struct_mutex);
		hang_fini(&h);
		mutex_unlock(&i915->drm.struct_mutex);
	}

	return err;
}

static int igt_reset_idle_engine(void *arg)
{
	return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
	return __igt_reset_engine(arg, true);
}

struct active_engine {
	struct task_struct *task;
	struct intel_engine_cs *engine;
	unsigned long resets;
	unsigned int flags;
};

#define TEST_ACTIVE	BIT(0)
#define TEST_OTHERS	BIT(1)
#define TEST_SELF	BIT(2)
#define TEST_PRIORITY	BIT(3)

static int active_request_put(struct i915_request *rq)
{
	int err = 0;

	if (!rq)
		return 0;

	if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
		GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
			  rq->engine->name,
			  rq->fence.context,
			  rq->fence.seqno);
		GEM_TRACE_DUMP();

		i915_gem_set_wedged(rq->i915);
		err = -EIO;
	}

	i915_request_put(rq);

	return err;
}

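/*
 * Worker thread used by __igt_reset_engines(): keep a rolling window of
 * requests in flight on its engine (optionally with randomised priorities)
 * while another engine is being reset, so that we can check the resets do
 * not disturb the unrelated workload.
 */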
static int active_engine(void *data)
{
	I915_RND_STATE(prng);
	struct active_engine *arg = data;
	struct intel_engine_cs *engine = arg->engine;
	struct i915_request *rq[8] = {};
	struct i915_gem_context *ctx[ARRAY_SIZE(rq)];
	struct drm_file *file;
	unsigned long count = 0;
	int err = 0;

	file = mock_file(engine->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	for (count = 0; count < ARRAY_SIZE(ctx); count++) {
		mutex_lock(&engine->i915->drm.struct_mutex);
		ctx[count] = live_context(engine->i915, file);
		mutex_unlock(&engine->i915->drm.struct_mutex);
		if (IS_ERR(ctx[count])) {
			err = PTR_ERR(ctx[count]);
			while (--count)
				i915_gem_context_put(ctx[count]);
			goto err_file;
		}
	}

	while (!kthread_should_stop()) {
		unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
		struct i915_request *old = rq[idx];
		struct i915_request *new;

		mutex_lock(&engine->i915->drm.struct_mutex);
		new = i915_request_alloc(engine, ctx[idx]);
		if (IS_ERR(new)) {
			mutex_unlock(&engine->i915->drm.struct_mutex);
			err = PTR_ERR(new);
			break;
		}

		if (arg->flags & TEST_PRIORITY)
			ctx[idx]->sched.priority =
				i915_prandom_u32_max_state(512, &prng);

		rq[idx] = i915_request_get(new);
		i915_request_add(new);
		mutex_unlock(&engine->i915->drm.struct_mutex);

		err = active_request_put(old);
		if (err)
			break;

		cond_resched();
	}

	for (count = 0; count < ARRAY_SIZE(rq); count++) {
		int err__ = active_request_put(rq[count]);

		/* Keep the first error */
		if (!err)
			err = err__;
	}

err_file:
	mock_file_free(engine->i915, file);
	return err;
}

static int __igt_reset_engines(struct drm_i915_private *i915,
			       const char *test_name,
			       unsigned int flags)
{
	struct intel_engine_cs *engine, *other;
	enum intel_engine_id id, tmp;
	struct hang h;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
	 * with any other engine.
	 */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (flags & TEST_ACTIVE) {
		mutex_lock(&i915->drm.struct_mutex);
		err = hang_init(&h, i915);
		mutex_unlock(&i915->drm.struct_mutex);
		if (err)
			return err;

		if (flags & TEST_PRIORITY)
			h.ctx->sched.priority = 1024;
	}

	for_each_engine(engine, i915, id) {
		struct active_engine threads[I915_NUM_ENGINES] = {};
		unsigned long global = i915_reset_count(&i915->gpu_error);
		unsigned long count = 0, reported;
		IGT_TIMEOUT(end_time);

		if (flags & TEST_ACTIVE &&
		    !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
			       engine->name, test_name);
			err = -EIO;
			break;
		}

		memset(threads, 0, sizeof(threads));
		for_each_engine(other, i915, tmp) {
			struct task_struct *tsk;

			threads[tmp].resets =
				i915_reset_engine_count(&i915->gpu_error,
							other);

			if (!(flags & TEST_OTHERS))
				continue;

			if (other == engine && !(flags & TEST_SELF))
				continue;

			threads[tmp].engine = other;
			threads[tmp].flags = flags;

			tsk = kthread_run(active_engine, &threads[tmp],
					  "igt/%s", other->name);
			if (IS_ERR(tsk)) {
				err = PTR_ERR(tsk);
				goto unwind;
			}

			threads[tmp].task = tsk;
			get_task_struct(tsk);
		}

		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		do {
			struct i915_request *rq = NULL;

			if (flags & TEST_ACTIVE) {
				mutex_lock(&i915->drm.struct_mutex);
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					mutex_unlock(&i915->drm.struct_mutex);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);
				mutex_unlock(&i915->drm.struct_mutex);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}
			}

			err = i915_reset_engine(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
				       engine->name, test_name, err);
				break;
			}

			count++;

			if (rq) {
				if (i915_request_wait(rq, 0, HZ / 5) < 0) {
					struct drm_printer p =
						drm_info_printer(i915->drm.dev);

					pr_err("i915_reset_engine(%s:%s):"
					       " failed to complete request after reset\n",
					       engine->name, test_name);
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					i915_gem_set_wedged(i915);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
				struct drm_printer p =
					drm_info_printer(i915->drm.dev);

				pr_err("i915_reset_engine(%s:%s):"
				       " failed to idle after reset\n",
				       engine->name, test_name);
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				err = -EIO;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
			engine->name, test_name, count);

		reported = i915_reset_engine_count(&i915->gpu_error, engine);
		reported -= threads[engine->id].resets;
		if (reported != count) {
			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
			       engine->name, test_name, count, reported);
			if (!err)
				err = -EINVAL;
		}

unwind:
		for_each_engine(other, i915, tmp) {
			int ret;

			if (!threads[tmp].task)
				continue;

			ret = kthread_stop(threads[tmp].task);
			if (ret) {
				pr_err("kthread for other engine %s failed, err=%d\n",
				       other->name, ret);
				if (!err)
					err = ret;
			}
			put_task_struct(threads[tmp].task);

			if (other != engine &&
			    threads[tmp].resets !=
			    i915_reset_engine_count(&i915->gpu_error, other)) {
				pr_err("Innocent engine %s was reset (count=%ld)\n",
				       other->name,
				       i915_reset_engine_count(&i915->gpu_error,
							       other) -
				       threads[tmp].resets);
				if (!err)
					err = -EINVAL;
			}
		}

		if (global != i915_reset_count(&i915->gpu_error)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(&i915->gpu_error) - global);
			if (!err)
				err = -EINVAL;
		}

		if (err)
			break;

		err = igt_flush_test(i915, 0);
		if (err)
			break;
	}

	if (i915_reset_failed(i915))
		err = -EIO;

	if (flags & TEST_ACTIVE) {
		mutex_lock(&i915->drm.struct_mutex);
		hang_fini(&h);
		mutex_unlock(&i915->drm.struct_mutex);
	}

	return err;
}

static int igt_reset_engines(void *arg)
{
	static const struct {
		const char *name;
		unsigned int flags;
	} phases[] = {
		{ "idle", 0 },
		{ "active", TEST_ACTIVE },
		{ "others-idle", TEST_OTHERS },
		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
		{
			"others-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
		},
		{
			"self-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
		},
		{ }
	};
	struct drm_i915_private *i915 = arg;
	typeof(*phases) *p;
	int err;

	for (p = phases; p->name; p++) {
		if (p->flags & TEST_PRIORITY) {
			if (!(i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
				continue;
		}

		err = __igt_reset_engines(arg, p->name, p->flags);
		if (err)
			return err;
	}

	return 0;
}

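/*
 * Pretend that hangcheck fired: perform the reset directly and return the
 * reset count sampled beforehand so callers can verify a new reset was
 * recorded.
 */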
static u32 fake_hangcheck(struct drm_i915_private *i915,
			  intel_engine_mask_t mask)
{
	u32 count = i915_reset_count(&i915->gpu_error);

	i915_reset(i915, mask, NULL);

	return count;
}

static int igt_reset_wait(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct i915_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	if (!intel_engine_can_store_dword(i915->engine[RCS0]))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	igt_global_reset_lock(i915);

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	rq = hang_create_request(&h, i915->engine[RCS0]);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_gem_set_wedged(i915);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(i915, ALL_ENGINES);

	timeout = i915_request_wait(rq, I915_WAIT_LOCKED, 10);
	if (timeout < 0) {
		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	if (i915_reset_count(&i915->gpu_error) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	igt_global_reset_unlock(i915);

	if (i915_reset_failed(i915))
		return -EIO;

	return err;
}

struct evict_vma {
	struct completion completion;
	struct i915_vma *vma;
};

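/*
 * The evict_vma/evict_fence workers signal that they have started and then
 * try to evict (or update the fence on) a vma that is kept busy by the
 * hanging batch; they can only make progress once the hang has been reset.
 */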
static int evict_vma(void *data)
{
	struct evict_vma *arg = data;
	struct i915_address_space *vm = arg->vma->vm;
	struct drm_i915_private *i915 = vm->i915;
	struct drm_mm_node evict = arg->vma->node;
	int err;

	complete(&arg->completion);

	mutex_lock(&i915->drm.struct_mutex);
	err = i915_gem_evict_for_node(vm, &evict, 0);
	mutex_unlock(&i915->drm.struct_mutex);

	return err;
}

static int evict_fence(void *data)
{
	struct evict_vma *arg = data;
	struct drm_i915_private *i915 = arg->vma->vm->i915;
	int err;

	complete(&arg->completion);

	mutex_lock(&i915->drm.struct_mutex);

	/* Mark the fence register as dirty to force the mmio update. */
	err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
	if (err) {
		pr_err("Invalid Y-tiling settings; err:%d\n", err);
		goto out_unlock;
	}

	err = i915_vma_pin_fence(arg->vma);
	if (err) {
		pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
		goto out_unlock;
	}

	i915_vma_unpin_fence(arg->vma);

out_unlock:
	mutex_unlock(&i915->drm.struct_mutex);

	return err;
}

static int __igt_reset_evict_vma(struct drm_i915_private *i915,
				 struct i915_address_space *vm,
				 int (*fn)(void *),
				 unsigned int flags)
{
	struct drm_i915_gem_object *obj;
	struct task_struct *tsk = NULL;
	struct i915_request *rq;
	struct evict_vma arg;
	struct hang h;
	int err;

	if (!intel_engine_can_store_dword(i915->engine[RCS0]))
		return 0;

	/* Check that we can recover an unbind stuck on a hanging request */

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	obj = i915_gem_object_create_internal(i915, SZ_1M);
	if (IS_ERR(obj)) {
		err = PTR_ERR(obj);
		goto fini;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
		if (err) {
			pr_err("Invalid X-tiling settings; err:%d\n", err);
			goto out_obj;
		}
	}

	arg.vma = i915_vma_instance(obj, vm, NULL);
	if (IS_ERR(arg.vma)) {
		err = PTR_ERR(arg.vma);
		goto out_obj;
	}

	rq = hang_create_request(&h, i915->engine[RCS0]);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_obj;
	}

	err = i915_vma_pin(arg.vma, 0, 0,
			   i915_vma_is_ggtt(arg.vma) ?
			   PIN_GLOBAL | PIN_MAPPABLE :
			   PIN_USER);
	if (err) {
		i915_request_add(rq);
		goto out_obj;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_vma_pin_fence(arg.vma);
		if (err) {
			pr_err("Unable to pin X-tiled fence; err:%d\n", err);
			i915_vma_unpin(arg.vma);
			i915_request_add(rq);
			goto out_obj;
		}
	}

	err = i915_vma_move_to_active(arg.vma, rq, flags);

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		i915_vma_unpin_fence(arg.vma);
	i915_vma_unpin(arg.vma);

	i915_request_get(rq);
	i915_request_add(rq);
	if (err)
		goto out_rq;

	mutex_unlock(&i915->drm.struct_mutex);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_gem_set_wedged(i915);
		goto out_reset;
	}

	init_completion(&arg.completion);

	tsk = kthread_run(fn, &arg, "igt/evict_vma");
	if (IS_ERR(tsk)) {
		err = PTR_ERR(tsk);
		tsk = NULL;
		goto out_reset;
	}
	get_task_struct(tsk);

	wait_for_completion(&arg.completion);

	if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("igt/evict_vma kthread did not wait\n");
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_gem_set_wedged(i915);
		goto out_reset;
	}

out_reset:
	igt_global_reset_lock(i915);
	fake_hangcheck(rq->i915, rq->engine->mask);
	igt_global_reset_unlock(i915);

	if (tsk) {
		struct igt_wedge_me w;

		/* The reset, even indirectly, should take less than 10ms. */
		igt_wedge_on_timeout(&w, i915, HZ / 10 /* 100ms */)
			err = kthread_stop(tsk);

		put_task_struct(tsk);
	}

	mutex_lock(&i915->drm.struct_mutex);
out_rq:
	i915_request_put(rq);
out_obj:
	i915_gem_object_put(obj);
fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);

	if (i915_reset_failed(i915))
		return -EIO;

	return err;
}

static int igt_reset_evict_ggtt(void *arg)
{
	struct drm_i915_private *i915 = arg;

	return __igt_reset_evict_vma(i915, &i915->ggtt.vm,
				     evict_vma, EXEC_OBJECT_WRITE);
}

static int igt_reset_evict_ppgtt(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct i915_gem_context *ctx;
	struct drm_file *file;
	int err;

	file = mock_file(i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	mutex_lock(&i915->drm.struct_mutex);
	ctx = live_context(i915, file);
	mutex_unlock(&i915->drm.struct_mutex);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto out;
	}

	err = 0;
	if (ctx->ppgtt)
		err = __igt_reset_evict_vma(i915, &ctx->ppgtt->vm,
					    evict_vma, EXEC_OBJECT_WRITE);

out:
	mock_file_free(i915, file);
	return err;
}

static int igt_reset_evict_fence(void *arg)
{
	struct drm_i915_private *i915 = arg;

	return __igt_reset_evict_vma(i915, &i915->ggtt.vm,
				     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
}

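/* Wait for every engine other than @exclude to become idle. */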
static int wait_for_others(struct drm_i915_private *i915,
			   struct intel_engine_cs *exclude)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, i915, id) {
		if (engine == exclude)
			continue;

		if (!wait_for_idle(engine))
			return -EIO;
	}

	return 0;
}

static int igt_reset_queue(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	igt_global_reset_lock(i915);

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	for_each_engine(engine, i915, id) {
		struct i915_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;

		if (!intel_engine_can_store_dword(engine))
			continue;

		prev = hang_create_request(&h, engine);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			goto fini;
		}

		i915_request_get(prev);
		i915_request_add(prev);

		count = 0;
		do {
			struct i915_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h, engine);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto fini;
			}

			i915_request_get(rq);
			i915_request_add(rq);

			/*
			 * Resetting the kernel context is not handled very
			 * well: if we trigger a device reset twice in quick
			 * succession while the kernel context is executing,
			 * we may end up skipping its breadcrumb. This is
			 * really only a problem for the selftest as normally
			 * there is a large interlude between resets
			 * (hangcheck), or we focus on resetting just one
			 * engine and so avoid repeatedly resetting innocents.
			 *
			 * Measure the readiness of the other engines before
			 * resetting, to avoid injecting the reset prior to
			 * completing their breadcrumbs.
			 */
			err = wait_for_others(i915, engine);
			if (err) {
				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
				       __func__, engine->name);
				i915_request_put(rq);
				i915_request_put(prev);

				GEM_TRACE_DUMP();
				i915_gem_set_wedged(i915);
				goto fini;
			}

			if (!wait_until_running(&h, prev)) {
				struct drm_printer p = drm_info_printer(i915->drm.dev);

				pr_err("%s(%s): Failed to start request %llx, at %x\n",
				       __func__, engine->name,
				       prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				i915_request_put(rq);
				i915_request_put(prev);

				i915_gem_set_wedged(i915);

				err = -EIO;
				goto fini;
			}

			reset_count = fake_hangcheck(i915, BIT(id));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (i915_reset_count(&i915->gpu_error) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			i915_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d resets\n", engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		i915_gem_chipset_flush(i915);

		i915_request_put(prev);

		err = igt_flush_test(i915, I915_WAIT_LOCKED);
		if (err)
			break;
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	igt_global_reset_unlock(i915);

	if (i915_reset_failed(i915))
		return -EIO;

	return err;
}

static int igt_handle_error(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine = i915->engine[RCS0];
	struct hang h;
	struct i915_request *rq;
	struct i915_gpu_state *error;
	int err;

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	mutex_lock(&i915->drm.struct_mutex);

	err = hang_init(&h, i915);
	if (err)
		goto err_unlock;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto err_fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_gem_set_wedged(i915);

		err = -EIO;
		goto err_request;
	}

	mutex_unlock(&i915->drm.struct_mutex);

	/* Temporarily disable error capture */
	error = xchg(&i915->gpu_error.first_error, (void *)-1);

	i915_handle_error(i915, engine->mask, 0, NULL);

	xchg(&i915->gpu_error.first_error, error);

	mutex_lock(&i915->drm.struct_mutex);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_request_put(rq);
err_fini:
	hang_fini(&h);
err_unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	return err;
}

static void __preempt_begin(void)
{
	preempt_disable();
}

static void __preempt_end(void)
{
	preempt_enable();
}

static void __softirq_begin(void)
{
	local_bh_disable();
}

static void __softirq_end(void)
{
	local_bh_enable();
}

static void __hardirq_begin(void)
{
	local_irq_disable();
}

static void __hardirq_end(void)
{
	local_irq_enable();
}

struct atomic_section {
	const char *name;
	void (*critical_section_begin)(void);
	void (*critical_section_end)(void);
};

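/*
 * Engine resets must be usable from atomic context, so exercise
 * i915_reset_engine() with preemption, softirqs and hardirqs disabled in
 * turn, keeping the execlists tasklet parked for the duration of the
 * critical section.
 */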
static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
				     const struct atomic_section *p,
				     const char *mode)
{
	struct tasklet_struct * const t = &engine->execlists.tasklet;
	int err;

	GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
		  engine->name, mode, p->name);

	tasklet_disable_nosync(t);
	p->critical_section_begin();

	err = i915_reset_engine(engine, NULL);

	p->critical_section_end();
	tasklet_enable(t);

	if (err)
		pr_err("i915_reset_engine(%s:%s) failed under %s\n",
		       engine->name, mode, p->name);

	return err;
}

static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
				   const struct atomic_section *p)
{
	struct drm_i915_private *i915 = engine->i915;
	struct i915_request *rq;
	struct hang h;
	int err;

	err = __igt_atomic_reset_engine(engine, p, "idle");
	if (err)
		return err;

	err = hang_init(&h, i915);
	if (err)
		return err;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (wait_until_running(&h, rq)) {
		err = __igt_atomic_reset_engine(engine, p, "active");
	} else {
		pr_err("%s(%s): Failed to start request %llx, at %x\n",
		       __func__, engine->name,
		       rq->fence.seqno, hws_seqno(&h, rq));
		i915_gem_set_wedged(i915);
		err = -EIO;
	}

	if (err == 0) {
		struct igt_wedge_me w;

		igt_wedge_on_timeout(&w, i915, HZ / 20 /* 50ms */)
			i915_request_wait(rq,
					  I915_WAIT_LOCKED,
					  MAX_SCHEDULE_TIMEOUT);
		if (i915_reset_failed(i915))
			err = -EIO;
	}

	i915_request_put(rq);
out:
	hang_fini(&h);
	return err;
}

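/*
 * Declare the device wedged and then perform a reset to recover it; used to
 * restore a known-good state between the atomic reset phases.
 */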
static void force_reset(struct drm_i915_private *i915)
{
	i915_gem_set_wedged(i915);
	i915_reset(i915, 0, NULL);
}

static int igt_atomic_reset(void *arg)
{
	static const struct atomic_section phases[] = {
		{ "preempt", __preempt_begin, __preempt_end },
		{ "softirq", __softirq_begin, __softirq_end },
		{ "hardirq", __hardirq_begin, __hardirq_end },
		{ }
	};
	struct drm_i915_private *i915 = arg;
	intel_wakeref_t wakeref;
	int err = 0;

	/* Check that the resets are usable from atomic context */

	if (USES_GUC_SUBMISSION(i915))
		return 0;

	igt_global_reset_lock(i915);
	mutex_lock(&i915->drm.struct_mutex);
	wakeref = intel_runtime_pm_get(i915);

	/* Flush any requests before we get started and check basics */
	force_reset(i915);
	if (i915_reset_failed(i915))
		goto unlock;

	if (intel_has_gpu_reset(i915)) {
		const typeof(*phases) *p;

		for (p = phases; p->name; p++) {
			GEM_TRACE("intel_gpu_reset under %s\n", p->name);

			p->critical_section_begin();
			err = intel_gpu_reset(i915, ALL_ENGINES);
			p->critical_section_end();

			if (err) {
				pr_err("intel_gpu_reset failed under %s\n",
				       p->name);
				goto out;
			}
		}

		force_reset(i915);
	}

	if (intel_has_reset_engine(i915)) {
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

		for_each_engine(engine, i915, id) {
			const typeof(*phases) *p;

			for (p = phases; p->name; p++) {
				err = igt_atomic_reset_engine(engine, p);
				if (err)
					goto out;
			}
		}
	}

out:
	/* As we poke around the guts, do a full reset before continuing. */
	force_reset(i915);

unlock:
	intel_runtime_pm_put(i915, wakeref);
	mutex_unlock(&i915->drm.struct_mutex);
	igt_global_reset_unlock(i915);

	return err;
}

int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_global_reset),
		SUBTEST(igt_wedged_reset),
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_nop),
		SUBTEST(igt_reset_nop_engine),
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
		SUBTEST(igt_reset_engines),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_reset_wait),
		SUBTEST(igt_reset_evict_ggtt),
		SUBTEST(igt_reset_evict_ppgtt),
		SUBTEST(igt_reset_evict_fence),
		SUBTEST(igt_handle_error),
		SUBTEST(igt_atomic_reset),
	};
	intel_wakeref_t wakeref;
	bool saved_hangcheck;
	int err;

	if (!intel_has_gpu_reset(i915))
		return 0;

	if (i915_terminally_wedged(i915))
		return -EIO;

	wakeref = intel_runtime_pm_get(i915);
	saved_hangcheck = fetch_and_zero(&i915_modparams.enable_hangcheck);
	drain_delayed_work(&i915->gpu_error.hangcheck_work);

	err = i915_subtests(tests, i915);

	mutex_lock(&i915->drm.struct_mutex);
	igt_flush_test(i915, I915_WAIT_LOCKED);
	mutex_unlock(&i915->drm.struct_mutex);

	i915_modparams.enable_hangcheck = saved_hangcheck;
	intel_runtime_pm_put(i915, wakeref);

	return err;
}