linux/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
   1/*
   2 * Copyright © 2016 Intel Corporation
   3 *
   4 * Permission is hereby granted, free of charge, to any person obtaining a
   5 * copy of this software and associated documentation files (the "Software"),
   6 * to deal in the Software without restriction, including without limitation
   7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 * and/or sell copies of the Software, and to permit persons to whom the
   9 * Software is furnished to do so, subject to the following conditions:
  10 *
  11 * The above copyright notice and this permission notice (including the next
  12 * paragraph) shall be included in all copies or substantial portions of the
  13 * Software.
  14 *
  15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 * IN THE SOFTWARE.
  22 *
  23 */
  24
  25#include <linux/kthread.h>
  26
  27#include "../i915_selftest.h"
  28#include "i915_random.h"
  29#include "igt_flush_test.h"
  30#include "igt_wedge_me.h"
  31
  32#include "mock_context.h"
  33#include "mock_drm.h"
  34
  35#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */
  36
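     /*
      * Shared state for the hang selftests: a kernel context plus two
      * internal objects - one page used as a scratch "HWS" into which each
      * request writes its seqno, and one page holding the self-referencing
      * batch that provides the hang.
      */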
  37struct hang {
  38        struct drm_i915_private *i915;
  39        struct drm_i915_gem_object *hws;
  40        struct drm_i915_gem_object *obj;
  41        struct i915_gem_context *ctx;
  42        u32 *seqno;
  43        u32 *batch;
  44};
  45
  46static int hang_init(struct hang *h, struct drm_i915_private *i915)
  47{
  48        void *vaddr;
  49        int err;
  50
  51        memset(h, 0, sizeof(*h));
  52        h->i915 = i915;
  53
  54        h->ctx = kernel_context(i915);
  55        if (IS_ERR(h->ctx))
  56                return PTR_ERR(h->ctx);
  57
  58        h->hws = i915_gem_object_create_internal(i915, PAGE_SIZE);
  59        if (IS_ERR(h->hws)) {
  60                err = PTR_ERR(h->hws);
  61                goto err_ctx;
  62        }
  63
  64        h->obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
  65        if (IS_ERR(h->obj)) {
  66                err = PTR_ERR(h->obj);
  67                goto err_hws;
  68        }
  69
  70        i915_gem_object_set_cache_level(h->hws, I915_CACHE_LLC);
  71        vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
  72        if (IS_ERR(vaddr)) {
  73                err = PTR_ERR(vaddr);
  74                goto err_obj;
  75        }
  76        h->seqno = memset(vaddr, 0xff, PAGE_SIZE);
  77
  78        vaddr = i915_gem_object_pin_map(h->obj,
  79                                        HAS_LLC(i915) ? I915_MAP_WB : I915_MAP_WC);
  80        if (IS_ERR(vaddr)) {
  81                err = PTR_ERR(vaddr);
  82                goto err_unpin_hws;
  83        }
  84        h->batch = vaddr;
  85
  86        return 0;
  87
  88err_unpin_hws:
  89        i915_gem_object_unpin_map(h->hws);
  90err_obj:
  91        i915_gem_object_put(h->obj);
  92err_hws:
  93        i915_gem_object_put(h->hws);
  94err_ctx:
  95        kernel_context_close(h->ctx);
  96        return err;
  97}
  98
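     /*
      * Each request stores its seqno into a per-fence-context slot within
      * the scratch HWS page, so we can tell when a particular batch has
      * started executing.
      */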
  99static u64 hws_address(const struct i915_vma *hws,
 100                       const struct i915_request *rq)
 101{
 102        return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
 103}
 104
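     /*
      * Emit a batch that writes the request's seqno to the HWS page and then
      * spins forever by branching back to its own start with
      * MI_BATCH_BUFFER_START. The MI_ARB_CHECK instructions provide
      * arbitration points within the loop; the hang is broken later by
      * rewriting the start of the batch to MI_BATCH_BUFFER_END (see
      * hang_fini()).
      */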
 105static int emit_recurse_batch(struct hang *h,
 106                              struct i915_request *rq)
 107{
 108        struct drm_i915_private *i915 = h->i915;
 109        struct i915_address_space *vm =
 110                rq->gem_context->ppgtt ?
 111                &rq->gem_context->ppgtt->vm :
 112                &i915->ggtt.vm;
 113        struct i915_vma *hws, *vma;
 114        unsigned int flags;
 115        u32 *batch;
 116        int err;
 117
 118        vma = i915_vma_instance(h->obj, vm, NULL);
 119        if (IS_ERR(vma))
 120                return PTR_ERR(vma);
 121
 122        hws = i915_vma_instance(h->hws, vm, NULL);
 123        if (IS_ERR(hws))
 124                return PTR_ERR(hws);
 125
 126        err = i915_vma_pin(vma, 0, 0, PIN_USER);
 127        if (err)
 128                return err;
 129
 130        err = i915_vma_pin(hws, 0, 0, PIN_USER);
 131        if (err)
 132                goto unpin_vma;
 133
 134        err = i915_vma_move_to_active(vma, rq, 0);
 135        if (err)
 136                goto unpin_hws;
 137
 138        if (!i915_gem_object_has_active_reference(vma->obj)) {
 139                i915_gem_object_get(vma->obj);
 140                i915_gem_object_set_active_reference(vma->obj);
 141        }
 142
 143        err = i915_vma_move_to_active(hws, rq, 0);
 144        if (err)
 145                goto unpin_hws;
 146
 147        if (!i915_gem_object_has_active_reference(hws->obj)) {
 148                i915_gem_object_get(hws->obj);
 149                i915_gem_object_set_active_reference(hws->obj);
 150        }
 151
 152        batch = h->batch;
 153        if (INTEL_GEN(i915) >= 8) {
 154                *batch++ = MI_STORE_DWORD_IMM_GEN4;
 155                *batch++ = lower_32_bits(hws_address(hws, rq));
 156                *batch++ = upper_32_bits(hws_address(hws, rq));
 157                *batch++ = rq->fence.seqno;
 158                *batch++ = MI_ARB_CHECK;
 159
 160                memset(batch, 0, 1024);
 161                batch += 1024 / sizeof(*batch);
 162
 163                *batch++ = MI_ARB_CHECK;
 164                *batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
 165                *batch++ = lower_32_bits(vma->node.start);
 166                *batch++ = upper_32_bits(vma->node.start);
 167        } else if (INTEL_GEN(i915) >= 6) {
 168                *batch++ = MI_STORE_DWORD_IMM_GEN4;
 169                *batch++ = 0;
 170                *batch++ = lower_32_bits(hws_address(hws, rq));
 171                *batch++ = rq->fence.seqno;
 172                *batch++ = MI_ARB_CHECK;
 173
 174                memset(batch, 0, 1024);
 175                batch += 1024 / sizeof(*batch);
 176
 177                *batch++ = MI_ARB_CHECK;
 178                *batch++ = MI_BATCH_BUFFER_START | 1 << 8;
 179                *batch++ = lower_32_bits(vma->node.start);
 180        } else if (INTEL_GEN(i915) >= 4) {
 181                *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
 182                *batch++ = 0;
 183                *batch++ = lower_32_bits(hws_address(hws, rq));
 184                *batch++ = rq->fence.seqno;
 185                *batch++ = MI_ARB_CHECK;
 186
 187                memset(batch, 0, 1024);
 188                batch += 1024 / sizeof(*batch);
 189
 190                *batch++ = MI_ARB_CHECK;
 191                *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
 192                *batch++ = lower_32_bits(vma->node.start);
 193        } else {
 194                *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
 195                *batch++ = lower_32_bits(hws_address(hws, rq));
 196                *batch++ = rq->fence.seqno;
 197                *batch++ = MI_ARB_CHECK;
 198
 199                memset(batch, 0, 1024);
 200                batch += 1024 / sizeof(*batch);
 201
 202                *batch++ = MI_ARB_CHECK;
 203                *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
 204                *batch++ = lower_32_bits(vma->node.start);
 205        }
 206        *batch++ = MI_BATCH_BUFFER_END; /* not reached */
 207        i915_gem_chipset_flush(h->i915);
 208
 209        flags = 0;
 210        if (INTEL_GEN(vm->i915) <= 5)
 211                flags |= I915_DISPATCH_SECURE;
 212
 213        err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);
 214
 215unpin_hws:
 216        i915_vma_unpin(hws);
 217unpin_vma:
 218        i915_vma_unpin(vma);
 219        return err;
 220}
 221
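     /*
      * Build a request on @engine that executes the spinning batch. If the
      * current batch object is still active (a previous hang has not yet
      * been cleaned up), allocate a fresh object rather than rewriting a
      * batch that may still be executing.
      */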
 222static struct i915_request *
 223hang_create_request(struct hang *h, struct intel_engine_cs *engine)
 224{
 225        struct i915_request *rq;
 226        int err;
 227
 228        if (i915_gem_object_is_active(h->obj)) {
 229                struct drm_i915_gem_object *obj;
 230                void *vaddr;
 231
 232                obj = i915_gem_object_create_internal(h->i915, PAGE_SIZE);
 233                if (IS_ERR(obj))
 234                        return ERR_CAST(obj);
 235
 236                vaddr = i915_gem_object_pin_map(obj,
 237                                                HAS_LLC(h->i915) ? I915_MAP_WB : I915_MAP_WC);
 238                if (IS_ERR(vaddr)) {
 239                        i915_gem_object_put(obj);
 240                        return ERR_CAST(vaddr);
 241                }
 242
 243                i915_gem_object_unpin_map(h->obj);
 244                i915_gem_object_put(h->obj);
 245
 246                h->obj = obj;
 247                h->batch = vaddr;
 248        }
 249
 250        rq = i915_request_alloc(engine, h->ctx);
 251        if (IS_ERR(rq))
 252                return rq;
 253
 254        err = emit_recurse_batch(h, rq);
 255        if (err) {
 256                i915_request_add(rq);
 257                return ERR_PTR(err);
 258        }
 259
 260        return rq;
 261}
 262
 263static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
 264{
 265        return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
 266}
 267
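     /* Break the infinite loop and release the objects used by the hang */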
 268static void hang_fini(struct hang *h)
 269{
 270        *h->batch = MI_BATCH_BUFFER_END;
 271        i915_gem_chipset_flush(h->i915);
 272
 273        i915_gem_object_unpin_map(h->obj);
 274        i915_gem_object_put(h->obj);
 275
 276        i915_gem_object_unpin_map(h->hws);
 277        i915_gem_object_put(h->hws);
 278
 279        kernel_context_close(h->ctx);
 280
 281        igt_flush_test(h->i915, I915_WAIT_LOCKED);
 282}
 283
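     /*
      * The request is considered running once its seqno write lands in the
      * HWS page; poll for up to 10us, then fall back to a slower wait of up
      * to 1s.
      */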
 284static bool wait_until_running(struct hang *h, struct i915_request *rq)
 285{
 286        return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
 287                                               rq->fence.seqno),
 288                             10) &&
 289                 wait_for(i915_seqno_passed(hws_seqno(h, rq),
 290                                            rq->fence.seqno),
 291                          1000));
 292}
 293
 294static int igt_hang_sanitycheck(void *arg)
 295{
 296        struct drm_i915_private *i915 = arg;
 297        struct i915_request *rq;
 298        struct intel_engine_cs *engine;
 299        enum intel_engine_id id;
 300        struct hang h;
 301        int err;
 302
 303        /* Basic check that we can execute our hanging batch */
 304
 305        mutex_lock(&i915->drm.struct_mutex);
 306        err = hang_init(&h, i915);
 307        if (err)
 308                goto unlock;
 309
 310        for_each_engine(engine, i915, id) {
 311                long timeout;
 312
 313                if (!intel_engine_can_store_dword(engine))
 314                        continue;
 315
 316                rq = hang_create_request(&h, engine);
 317                if (IS_ERR(rq)) {
 318                        err = PTR_ERR(rq);
 319                        pr_err("Failed to create request for %s, err=%d\n",
 320                               engine->name, err);
 321                        goto fini;
 322                }
 323
 324                i915_request_get(rq);
 325
 326                *h.batch = MI_BATCH_BUFFER_END;
 327                i915_gem_chipset_flush(i915);
 328
 329                i915_request_add(rq);
 330
 331                timeout = i915_request_wait(rq,
 332                                            I915_WAIT_LOCKED,
 333                                            MAX_SCHEDULE_TIMEOUT);
 334                i915_request_put(rq);
 335
 336                if (timeout < 0) {
 337                        err = timeout;
 338                        pr_err("Wait for request failed on %s, err=%d\n",
 339                               engine->name, err);
 340                        goto fini;
 341                }
 342        }
 343
 344fini:
 345        hang_fini(&h);
 346unlock:
 347        mutex_unlock(&i915->drm.struct_mutex);
 348        return err;
 349}
 350
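     /*
      * Take exclusive ownership of the reset machinery: set the backoff bit
      * and every per-engine reset bit, waiting out anyone who already holds
      * them, so nothing else can reset the GPU behind the test's back.
      */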
 351static void global_reset_lock(struct drm_i915_private *i915)
 352{
 353        struct intel_engine_cs *engine;
 354        enum intel_engine_id id;
 355
 356        pr_debug("%s: current gpu_error=%08lx\n",
 357                 __func__, i915->gpu_error.flags);
 358
 359        while (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags))
 360                wait_event(i915->gpu_error.reset_queue,
 361                           !test_bit(I915_RESET_BACKOFF,
 362                                     &i915->gpu_error.flags));
 363
 364        for_each_engine(engine, i915, id) {
 365                while (test_and_set_bit(I915_RESET_ENGINE + id,
 366                                        &i915->gpu_error.flags))
 367                        wait_on_bit(&i915->gpu_error.flags,
 368                                    I915_RESET_ENGINE + id,
 369                                    TASK_UNINTERRUPTIBLE);
 370        }
 371}
 372
 373static void global_reset_unlock(struct drm_i915_private *i915)
 374{
 375        struct intel_engine_cs *engine;
 376        enum intel_engine_id id;
 377
 378        for_each_engine(engine, i915, id)
 379                clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
 380
 381        clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
 382        wake_up_all(&i915->gpu_error.reset_queue);
 383}
 384
 385static int igt_global_reset(void *arg)
 386{
 387        struct drm_i915_private *i915 = arg;
 388        unsigned int reset_count;
 389        int err = 0;
 390
 391        /* Check that we can issue a global GPU reset */
 392
 393        global_reset_lock(i915);
 394        set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);
 395
 396        mutex_lock(&i915->drm.struct_mutex);
 397        reset_count = i915_reset_count(&i915->gpu_error);
 398
 399        i915_reset(i915, ALL_ENGINES, NULL);
 400
 401        if (i915_reset_count(&i915->gpu_error) == reset_count) {
 402                pr_err("No GPU reset recorded!\n");
 403                err = -EINVAL;
 404        }
 405        mutex_unlock(&i915->drm.struct_mutex);
 406
 407        GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
 408        global_reset_unlock(i915);
 409
 410        if (i915_terminally_wedged(&i915->gpu_error))
 411                err = -EIO;
 412
 413        return err;
 414}
 415
 416static bool wait_for_idle(struct intel_engine_cs *engine)
 417{
 418        return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
 419}
 420
 421static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
 422{
 423        struct intel_engine_cs *engine;
 424        enum intel_engine_id id;
 425        struct hang h;
 426        int err = 0;
 427
  428        /* Check that we can reset an individual engine, idle (no-op) or actively hanging */
 429
 430        if (!intel_has_reset_engine(i915))
 431                return 0;
 432
 433        if (active) {
 434                mutex_lock(&i915->drm.struct_mutex);
 435                err = hang_init(&h, i915);
 436                mutex_unlock(&i915->drm.struct_mutex);
 437                if (err)
 438                        return err;
 439        }
 440
 441        for_each_engine(engine, i915, id) {
 442                unsigned int reset_count, reset_engine_count;
 443                IGT_TIMEOUT(end_time);
 444
 445                if (active && !intel_engine_can_store_dword(engine))
 446                        continue;
 447
 448                if (!wait_for_idle(engine)) {
 449                        pr_err("%s failed to idle before reset\n",
 450                               engine->name);
 451                        err = -EIO;
 452                        break;
 453                }
 454
 455                reset_count = i915_reset_count(&i915->gpu_error);
 456                reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
 457                                                             engine);
 458
 459                set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
 460                do {
 461                        u32 seqno = intel_engine_get_seqno(engine);
 462
 463                        if (active) {
 464                                struct i915_request *rq;
 465
 466                                mutex_lock(&i915->drm.struct_mutex);
 467                                rq = hang_create_request(&h, engine);
 468                                if (IS_ERR(rq)) {
 469                                        err = PTR_ERR(rq);
 470                                        mutex_unlock(&i915->drm.struct_mutex);
 471                                        break;
 472                                }
 473
 474                                i915_request_get(rq);
 475                                i915_request_add(rq);
 476                                mutex_unlock(&i915->drm.struct_mutex);
 477
 478                                if (!wait_until_running(&h, rq)) {
 479                                        struct drm_printer p = drm_info_printer(i915->drm.dev);
 480
 481                                        pr_err("%s: Failed to start request %x, at %x\n",
 482                                               __func__, rq->fence.seqno, hws_seqno(&h, rq));
 483                                        intel_engine_dump(engine, &p,
 484                                                          "%s\n", engine->name);
 485
 486                                        i915_request_put(rq);
 487                                        err = -EIO;
 488                                        break;
 489                                }
 490
 491                                GEM_BUG_ON(!rq->global_seqno);
 492                                seqno = rq->global_seqno - 1;
 493                                i915_request_put(rq);
 494                        }
 495
 496                        err = i915_reset_engine(engine, NULL);
 497                        if (err) {
 498                                pr_err("i915_reset_engine failed\n");
 499                                break;
 500                        }
 501
 502                        if (i915_reset_count(&i915->gpu_error) != reset_count) {
 503                                pr_err("Full GPU reset recorded! (engine reset expected)\n");
 504                                err = -EINVAL;
 505                                break;
 506                        }
 507
 508                        reset_engine_count += active;
 509                        if (i915_reset_engine_count(&i915->gpu_error, engine) !=
 510                            reset_engine_count) {
 511                                pr_err("%s engine reset %srecorded!\n",
 512                                       engine->name, active ? "not " : "");
 513                                err = -EINVAL;
 514                                break;
 515                        }
 516
 517                        if (!wait_for_idle(engine)) {
 518                                struct drm_printer p =
 519                                        drm_info_printer(i915->drm.dev);
 520
 521                                pr_err("%s failed to idle after reset\n",
 522                                       engine->name);
 523                                intel_engine_dump(engine, &p,
 524                                                  "%s\n", engine->name);
 525
 526                                err = -EIO;
 527                                break;
 528                        }
 529                } while (time_before(jiffies, end_time));
 530                clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
 531
 532                if (err)
 533                        break;
 534
 535                err = igt_flush_test(i915, 0);
 536                if (err)
 537                        break;
 538        }
 539
 540        if (i915_terminally_wedged(&i915->gpu_error))
 541                err = -EIO;
 542
 543        if (active) {
 544                mutex_lock(&i915->drm.struct_mutex);
 545                hang_fini(&h);
 546                mutex_unlock(&i915->drm.struct_mutex);
 547        }
 548
 549        return err;
 550}
 551
 552static int igt_reset_idle_engine(void *arg)
 553{
 554        return __igt_reset_engine(arg, false);
 555}
 556
 557static int igt_reset_active_engine(void *arg)
 558{
 559        return __igt_reset_engine(arg, true);
 560}
 561
 562struct active_engine {
 563        struct task_struct *task;
 564        struct intel_engine_cs *engine;
 565        unsigned long resets;
 566        unsigned int flags;
 567};
 568
 569#define TEST_ACTIVE     BIT(0)
 570#define TEST_OTHERS     BIT(1)
 571#define TEST_SELF       BIT(2)
 572#define TEST_PRIORITY   BIT(3)
 573
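     /*
      * Wait for a background request to complete; if it fails to do so
      * within 5s, dump the trace and wedge the device so the failure is
      * fatal rather than a silent hang.
      */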
 574static int active_request_put(struct i915_request *rq)
 575{
 576        int err = 0;
 577
 578        if (!rq)
 579                return 0;
 580
 581        if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
 582                GEM_TRACE("%s timed out waiting for completion of fence %llx:%d, seqno %d.\n",
 583                          rq->engine->name,
 584                          rq->fence.context,
 585                          rq->fence.seqno,
 586                          i915_request_global_seqno(rq));
 587                GEM_TRACE_DUMP();
 588
 589                i915_gem_set_wedged(rq->i915);
 590                err = -EIO;
 591        }
 592
 593        i915_request_put(rq);
 594
 595        return err;
 596}
 597
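     /*
      * kthread body used to keep another engine busy while the target
      * engine is being reset: it continually submits requests across a
      * small ring of contexts (optionally with random priorities) and
      * checks that each one completes.
      */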
 598static int active_engine(void *data)
 599{
 600        I915_RND_STATE(prng);
 601        struct active_engine *arg = data;
 602        struct intel_engine_cs *engine = arg->engine;
 603        struct i915_request *rq[8] = {};
 604        struct i915_gem_context *ctx[ARRAY_SIZE(rq)];
 605        struct drm_file *file;
 606        unsigned long count = 0;
 607        int err = 0;
 608
 609        file = mock_file(engine->i915);
 610        if (IS_ERR(file))
 611                return PTR_ERR(file);
 612
 613        for (count = 0; count < ARRAY_SIZE(ctx); count++) {
 614                mutex_lock(&engine->i915->drm.struct_mutex);
 615                ctx[count] = live_context(engine->i915, file);
 616                mutex_unlock(&engine->i915->drm.struct_mutex);
 617                if (IS_ERR(ctx[count])) {
 618                        err = PTR_ERR(ctx[count]);
 619                        while (--count)
 620                                i915_gem_context_put(ctx[count]);
 621                        goto err_file;
 622                }
 623        }
 624
 625        while (!kthread_should_stop()) {
 626                unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
 627                struct i915_request *old = rq[idx];
 628                struct i915_request *new;
 629
 630                mutex_lock(&engine->i915->drm.struct_mutex);
 631                new = i915_request_alloc(engine, ctx[idx]);
 632                if (IS_ERR(new)) {
 633                        mutex_unlock(&engine->i915->drm.struct_mutex);
 634                        err = PTR_ERR(new);
 635                        break;
 636                }
 637
 638                if (arg->flags & TEST_PRIORITY)
 639                        ctx[idx]->sched.priority =
 640                                i915_prandom_u32_max_state(512, &prng);
 641
 642                rq[idx] = i915_request_get(new);
 643                i915_request_add(new);
 644                mutex_unlock(&engine->i915->drm.struct_mutex);
 645
 646                err = active_request_put(old);
 647                if (err)
 648                        break;
 649
 650                cond_resched();
 651        }
 652
 653        for (count = 0; count < ARRAY_SIZE(rq); count++) {
 654                int err__ = active_request_put(rq[count]);
 655
 656                /* Keep the first error */
 657                if (!err)
 658                        err = err__;
 659        }
 660
 661err_file:
 662        mock_file_free(engine->i915, file);
 663        return err;
 664}
 665
 666static int __igt_reset_engines(struct drm_i915_private *i915,
 667                               const char *test_name,
 668                               unsigned int flags)
 669{
 670        struct intel_engine_cs *engine, *other;
 671        enum intel_engine_id id, tmp;
 672        struct hang h;
 673        int err = 0;
 674
 675        /* Check that issuing a reset on one engine does not interfere
 676         * with any other engine.
 677         */
 678
 679        if (!intel_has_reset_engine(i915))
 680                return 0;
 681
 682        if (flags & TEST_ACTIVE) {
 683                mutex_lock(&i915->drm.struct_mutex);
 684                err = hang_init(&h, i915);
 685                mutex_unlock(&i915->drm.struct_mutex);
 686                if (err)
 687                        return err;
 688
 689                if (flags & TEST_PRIORITY)
 690                        h.ctx->sched.priority = 1024;
 691        }
 692
 693        for_each_engine(engine, i915, id) {
 694                struct active_engine threads[I915_NUM_ENGINES] = {};
 695                unsigned long global = i915_reset_count(&i915->gpu_error);
 696                unsigned long count = 0, reported;
 697                IGT_TIMEOUT(end_time);
 698
 699                if (flags & TEST_ACTIVE &&
 700                    !intel_engine_can_store_dword(engine))
 701                        continue;
 702
 703                if (!wait_for_idle(engine)) {
 704                        pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
 705                               engine->name, test_name);
 706                        err = -EIO;
 707                        break;
 708                }
 709
 710                memset(threads, 0, sizeof(threads));
 711                for_each_engine(other, i915, tmp) {
 712                        struct task_struct *tsk;
 713
 714                        threads[tmp].resets =
 715                                i915_reset_engine_count(&i915->gpu_error,
 716                                                        other);
 717
 718                        if (!(flags & TEST_OTHERS))
 719                                continue;
 720
 721                        if (other == engine && !(flags & TEST_SELF))
 722                                continue;
 723
 724                        threads[tmp].engine = other;
 725                        threads[tmp].flags = flags;
 726
 727                        tsk = kthread_run(active_engine, &threads[tmp],
 728                                          "igt/%s", other->name);
 729                        if (IS_ERR(tsk)) {
 730                                err = PTR_ERR(tsk);
 731                                goto unwind;
 732                        }
 733
 734                        threads[tmp].task = tsk;
 735                        get_task_struct(tsk);
 736                }
 737
 738                set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
 739                do {
 740                        u32 seqno = intel_engine_get_seqno(engine);
 741                        struct i915_request *rq = NULL;
 742
 743                        if (flags & TEST_ACTIVE) {
 744                                mutex_lock(&i915->drm.struct_mutex);
 745                                rq = hang_create_request(&h, engine);
 746                                if (IS_ERR(rq)) {
 747                                        err = PTR_ERR(rq);
 748                                        mutex_unlock(&i915->drm.struct_mutex);
 749                                        break;
 750                                }
 751
 752                                i915_request_get(rq);
 753                                i915_request_add(rq);
 754                                mutex_unlock(&i915->drm.struct_mutex);
 755
 756                                if (!wait_until_running(&h, rq)) {
 757                                        struct drm_printer p = drm_info_printer(i915->drm.dev);
 758
 759                                        pr_err("%s: Failed to start request %x, at %x\n",
 760                                               __func__, rq->fence.seqno, hws_seqno(&h, rq));
 761                                        intel_engine_dump(engine, &p,
 762                                                          "%s\n", engine->name);
 763
 764                                        i915_request_put(rq);
 765                                        err = -EIO;
 766                                        break;
 767                                }
 768
 769                                GEM_BUG_ON(!rq->global_seqno);
 770                                seqno = rq->global_seqno - 1;
 771                        }
 772
 773                        err = i915_reset_engine(engine, NULL);
 774                        if (err) {
 775                                pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
 776                                       engine->name, test_name, err);
 777                                break;
 778                        }
 779
 780                        count++;
 781
 782                        if (rq) {
 783                                i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
 784                                i915_request_put(rq);
 785                        }
 786
 787                        if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
 788                                struct drm_printer p =
 789                                        drm_info_printer(i915->drm.dev);
 790
 791                                pr_err("i915_reset_engine(%s:%s):"
 792                                       " failed to idle after reset\n",
 793                                       engine->name, test_name);
 794                                intel_engine_dump(engine, &p,
 795                                                  "%s\n", engine->name);
 796
 797                                err = -EIO;
 798                                break;
 799                        }
 800                } while (time_before(jiffies, end_time));
 801                clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
 802                pr_info("i915_reset_engine(%s:%s): %lu resets\n",
 803                        engine->name, test_name, count);
 804
 805                reported = i915_reset_engine_count(&i915->gpu_error, engine);
 806                reported -= threads[engine->id].resets;
 807                if (reported != (flags & TEST_ACTIVE ? count : 0)) {
 808                        pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu, expected %lu reported\n",
 809                               engine->name, test_name, count, reported,
 810                               (flags & TEST_ACTIVE ? count : 0));
 811                        if (!err)
 812                                err = -EINVAL;
 813                }
 814
 815unwind:
 816                for_each_engine(other, i915, tmp) {
 817                        int ret;
 818
 819                        if (!threads[tmp].task)
 820                                continue;
 821
 822                        ret = kthread_stop(threads[tmp].task);
 823                        if (ret) {
 824                                pr_err("kthread for other engine %s failed, err=%d\n",
 825                                       other->name, ret);
 826                                if (!err)
 827                                        err = ret;
 828                        }
 829                        put_task_struct(threads[tmp].task);
 830
 831                        if (other != engine &&
 832                            threads[tmp].resets !=
 833                            i915_reset_engine_count(&i915->gpu_error, other)) {
 834                                pr_err("Innocent engine %s was reset (count=%ld)\n",
 835                                       other->name,
 836                                       i915_reset_engine_count(&i915->gpu_error,
 837                                                               other) -
 838                                       threads[tmp].resets);
 839                                if (!err)
 840                                        err = -EINVAL;
 841                        }
 842                }
 843
 844                if (global != i915_reset_count(&i915->gpu_error)) {
 845                        pr_err("Global reset (count=%ld)!\n",
 846                               i915_reset_count(&i915->gpu_error) - global);
 847                        if (!err)
 848                                err = -EINVAL;
 849                }
 850
 851                if (err)
 852                        break;
 853
 854                err = igt_flush_test(i915, 0);
 855                if (err)
 856                        break;
 857        }
 858
 859        if (i915_terminally_wedged(&i915->gpu_error))
 860                err = -EIO;
 861
 862        if (flags & TEST_ACTIVE) {
 863                mutex_lock(&i915->drm.struct_mutex);
 864                hang_fini(&h);
 865                mutex_unlock(&i915->drm.struct_mutex);
 866        }
 867
 868        return err;
 869}
 870
 871static int igt_reset_engines(void *arg)
 872{
 873        static const struct {
 874                const char *name;
 875                unsigned int flags;
 876        } phases[] = {
 877                { "idle", 0 },
 878                { "active", TEST_ACTIVE },
 879                { "others-idle", TEST_OTHERS },
 880                { "others-active", TEST_OTHERS | TEST_ACTIVE },
 881                {
 882                        "others-priority",
 883                        TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
 884                },
 885                {
 886                        "self-priority",
 887                        TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
 888                },
 889                { }
 890        };
 891        struct drm_i915_private *i915 = arg;
 892        typeof(*phases) *p;
 893        int err;
 894
 895        for (p = phases; p->name; p++) {
 896                if (p->flags & TEST_PRIORITY) {
 897                        if (!(i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
 898                                continue;
 899                }
 900
 901                err = __igt_reset_engines(arg, p->name, p->flags);
 902                if (err)
 903                        return err;
 904        }
 905
 906        return 0;
 907}
 908
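     /*
      * Mimic the hangcheck worker declaring a hang: mark the stalled
      * engines, set I915_RESET_HANDOFF and wake any waiters so that the
      * reset is handed off, much as the real hangcheck would do.
      */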
 909static u32 fake_hangcheck(struct i915_request *rq, u32 mask)
 910{
 911        struct i915_gpu_error *error = &rq->i915->gpu_error;
 912        u32 reset_count = i915_reset_count(error);
 913
 914        error->stalled_mask = mask;
 915
  916        /* set_bit() must be after we have set up the backchannel (mask) */
 917        smp_mb__before_atomic();
 918        set_bit(I915_RESET_HANDOFF, &error->flags);
 919
 920        wake_up_all(&error->wait_queue);
 921
 922        return reset_count;
 923}
 924
 925static int igt_reset_wait(void *arg)
 926{
 927        struct drm_i915_private *i915 = arg;
 928        struct i915_request *rq;
 929        unsigned int reset_count;
 930        struct hang h;
 931        long timeout;
 932        int err;
 933
 934        if (!intel_engine_can_store_dword(i915->engine[RCS]))
 935                return 0;
 936
 937        /* Check that we detect a stuck waiter and issue a reset */
 938
 939        global_reset_lock(i915);
 940
 941        mutex_lock(&i915->drm.struct_mutex);
 942        err = hang_init(&h, i915);
 943        if (err)
 944                goto unlock;
 945
 946        rq = hang_create_request(&h, i915->engine[RCS]);
 947        if (IS_ERR(rq)) {
 948                err = PTR_ERR(rq);
 949                goto fini;
 950        }
 951
 952        i915_request_get(rq);
 953        i915_request_add(rq);
 954
 955        if (!wait_until_running(&h, rq)) {
 956                struct drm_printer p = drm_info_printer(i915->drm.dev);
 957
 958                pr_err("%s: Failed to start request %x, at %x\n",
 959                       __func__, rq->fence.seqno, hws_seqno(&h, rq));
 960                intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
 961
 962                i915_gem_set_wedged(i915);
 963
 964                err = -EIO;
 965                goto out_rq;
 966        }
 967
 968        reset_count = fake_hangcheck(rq, ALL_ENGINES);
 969
 970        timeout = i915_request_wait(rq, I915_WAIT_LOCKED, 10);
 971        if (timeout < 0) {
 972                pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
 973                       timeout);
 974                err = timeout;
 975                goto out_rq;
 976        }
 977
 978        GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
 979        if (i915_reset_count(&i915->gpu_error) == reset_count) {
 980                pr_err("No GPU reset recorded!\n");
 981                err = -EINVAL;
 982                goto out_rq;
 983        }
 984
 985out_rq:
 986        i915_request_put(rq);
 987fini:
 988        hang_fini(&h);
 989unlock:
 990        mutex_unlock(&i915->drm.struct_mutex);
 991        global_reset_unlock(i915);
 992
 993        if (i915_terminally_wedged(&i915->gpu_error))
 994                return -EIO;
 995
 996        return err;
 997}
 998
 999struct evict_vma {
1000        struct completion completion;
1001        struct i915_vma *vma;
1002};
1003
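     /*
      * Thread body for the evict-vma tests: attempt to evict the node
      * occupied by a vma that is still busy on the hanging request, which
      * can only complete once that hang has been reset.
      */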
1004static int evict_vma(void *data)
1005{
1006        struct evict_vma *arg = data;
1007        struct i915_address_space *vm = arg->vma->vm;
1008        struct drm_i915_private *i915 = vm->i915;
1009        struct drm_mm_node evict = arg->vma->node;
1010        int err;
1011
1012        complete(&arg->completion);
1013
1014        mutex_lock(&i915->drm.struct_mutex);
1015        err = i915_gem_evict_for_node(vm, &evict, 0);
1016        mutex_unlock(&i915->drm.struct_mutex);
1017
1018        return err;
1019}
1020
1021static int evict_fence(void *data)
1022{
1023        struct evict_vma *arg = data;
1024        struct drm_i915_private *i915 = arg->vma->vm->i915;
1025        int err;
1026
1027        complete(&arg->completion);
1028
1029        mutex_lock(&i915->drm.struct_mutex);
1030
1031        /* Mark the fence register as dirty to force the mmio update. */
1032        err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
1033        if (err) {
1034                pr_err("Invalid Y-tiling settings; err:%d\n", err);
1035                goto out_unlock;
1036        }
1037
1038        err = i915_vma_pin_fence(arg->vma);
1039        if (err) {
1040                pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
1041                goto out_unlock;
1042        }
1043
1044        i915_vma_unpin_fence(arg->vma);
1045
1046out_unlock:
1047        mutex_unlock(&i915->drm.struct_mutex);
1048
1049        return err;
1050}
1051
1052static int __igt_reset_evict_vma(struct drm_i915_private *i915,
1053                                 struct i915_address_space *vm,
1054                                 int (*fn)(void *),
1055                                 unsigned int flags)
1056{
1057        struct drm_i915_gem_object *obj;
1058        struct task_struct *tsk = NULL;
1059        struct i915_request *rq;
1060        struct evict_vma arg;
1061        struct hang h;
1062        int err;
1063
1064        if (!intel_engine_can_store_dword(i915->engine[RCS]))
1065                return 0;
1066
1067        /* Check that we can recover an unbind stuck on a hanging request */
1068
1069        global_reset_lock(i915);
1070
1071        mutex_lock(&i915->drm.struct_mutex);
1072        err = hang_init(&h, i915);
1073        if (err)
1074                goto unlock;
1075
1076        obj = i915_gem_object_create_internal(i915, SZ_1M);
1077        if (IS_ERR(obj)) {
1078                err = PTR_ERR(obj);
1079                goto fini;
1080        }
1081
1082        if (flags & EXEC_OBJECT_NEEDS_FENCE) {
1083                err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
1084                if (err) {
1085                        pr_err("Invalid X-tiling settings; err:%d\n", err);
1086                        goto out_obj;
1087                }
1088        }
1089
1090        arg.vma = i915_vma_instance(obj, vm, NULL);
1091        if (IS_ERR(arg.vma)) {
1092                err = PTR_ERR(arg.vma);
1093                goto out_obj;
1094        }
1095
1096        rq = hang_create_request(&h, i915->engine[RCS]);
1097        if (IS_ERR(rq)) {
1098                err = PTR_ERR(rq);
1099                goto out_obj;
1100        }
1101
1102        err = i915_vma_pin(arg.vma, 0, 0,
1103                           i915_vma_is_ggtt(arg.vma) ?
1104                           PIN_GLOBAL | PIN_MAPPABLE :
1105                           PIN_USER);
1106        if (err) {
1107                i915_request_add(rq);
1108                goto out_obj;
1109        }
1110
1111        if (flags & EXEC_OBJECT_NEEDS_FENCE) {
1112                err = i915_vma_pin_fence(arg.vma);
1113                if (err) {
1114                        pr_err("Unable to pin X-tiled fence; err:%d\n", err);
1115                        i915_vma_unpin(arg.vma);
1116                        i915_request_add(rq);
1117                        goto out_obj;
1118                }
1119        }
1120
1121        err = i915_vma_move_to_active(arg.vma, rq, flags);
1122
1123        if (flags & EXEC_OBJECT_NEEDS_FENCE)
1124                i915_vma_unpin_fence(arg.vma);
1125        i915_vma_unpin(arg.vma);
1126
1127        i915_request_get(rq);
1128        i915_request_add(rq);
1129        if (err)
1130                goto out_rq;
1131
1132        mutex_unlock(&i915->drm.struct_mutex);
1133
1134        if (!wait_until_running(&h, rq)) {
1135                struct drm_printer p = drm_info_printer(i915->drm.dev);
1136
1137                pr_err("%s: Failed to start request %x, at %x\n",
1138                       __func__, rq->fence.seqno, hws_seqno(&h, rq));
1139                intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1140
1141                i915_gem_set_wedged(i915);
1142                goto out_reset;
1143        }
1144
1145        init_completion(&arg.completion);
1146
1147        tsk = kthread_run(fn, &arg, "igt/evict_vma");
1148        if (IS_ERR(tsk)) {
1149                err = PTR_ERR(tsk);
1150                tsk = NULL;
1151                goto out_reset;
1152        }
1153
1154        wait_for_completion(&arg.completion);
1155
1156        if (wait_for(waitqueue_active(&rq->execute), 10)) {
1157                struct drm_printer p = drm_info_printer(i915->drm.dev);
1158
1159                pr_err("igt/evict_vma kthread did not wait\n");
1160                intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1161
1162                i915_gem_set_wedged(i915);
1163                goto out_reset;
1164        }
1165
1166out_reset:
1167        fake_hangcheck(rq, intel_engine_flag(rq->engine));
1168
1169        if (tsk) {
1170                struct igt_wedge_me w;
1171
1172                /* The reset, even indirectly, should take less than 10ms. */
1173                igt_wedge_on_timeout(&w, i915, HZ / 10 /* 100ms timeout*/)
1174                        err = kthread_stop(tsk);
1175        }
1176
1177        mutex_lock(&i915->drm.struct_mutex);
1178out_rq:
1179        i915_request_put(rq);
1180out_obj:
1181        i915_gem_object_put(obj);
1182fini:
1183        hang_fini(&h);
1184unlock:
1185        mutex_unlock(&i915->drm.struct_mutex);
1186        global_reset_unlock(i915);
1187
1188        if (i915_terminally_wedged(&i915->gpu_error))
1189                return -EIO;
1190
1191        return err;
1192}
1193
1194static int igt_reset_evict_ggtt(void *arg)
1195{
1196        struct drm_i915_private *i915 = arg;
1197
1198        return __igt_reset_evict_vma(i915, &i915->ggtt.vm,
1199                                     evict_vma, EXEC_OBJECT_WRITE);
1200}
1201
1202static int igt_reset_evict_ppgtt(void *arg)
1203{
1204        struct drm_i915_private *i915 = arg;
1205        struct i915_gem_context *ctx;
1206        struct drm_file *file;
1207        int err;
1208
1209        file = mock_file(i915);
1210        if (IS_ERR(file))
1211                return PTR_ERR(file);
1212
1213        mutex_lock(&i915->drm.struct_mutex);
1214        ctx = live_context(i915, file);
1215        mutex_unlock(&i915->drm.struct_mutex);
1216        if (IS_ERR(ctx)) {
1217                err = PTR_ERR(ctx);
1218                goto out;
1219        }
1220
1221        err = 0;
1222        if (ctx->ppgtt) /* aliasing == global gtt locking, covered above */
1223                err = __igt_reset_evict_vma(i915, &ctx->ppgtt->vm,
1224                                            evict_vma, EXEC_OBJECT_WRITE);
1225
1226out:
1227        mock_file_free(i915, file);
1228        return err;
1229}
1230
1231static int igt_reset_evict_fence(void *arg)
1232{
1233        struct drm_i915_private *i915 = arg;
1234
1235        return __igt_reset_evict_vma(i915, &i915->ggtt.vm,
1236                                     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
1237}
1238
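     /* Wait for every engine other than @exclude to become idle */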
1239static int wait_for_others(struct drm_i915_private *i915,
1240                           struct intel_engine_cs *exclude)
1241{
1242        struct intel_engine_cs *engine;
1243        enum intel_engine_id id;
1244
1245        for_each_engine(engine, i915, id) {
1246                if (engine == exclude)
1247                        continue;
1248
1249                if (!wait_for_idle(engine))
1250                        return -EIO;
1251        }
1252
1253        return 0;
1254}
1255
1256static int igt_reset_queue(void *arg)
1257{
1258        struct drm_i915_private *i915 = arg;
1259        struct intel_engine_cs *engine;
1260        enum intel_engine_id id;
1261        struct hang h;
1262        int err;
1263
1264        /* Check that we replay pending requests following a hang */
1265
1266        global_reset_lock(i915);
1267
1268        mutex_lock(&i915->drm.struct_mutex);
1269        err = hang_init(&h, i915);
1270        if (err)
1271                goto unlock;
1272
1273        for_each_engine(engine, i915, id) {
1274                struct i915_request *prev;
1275                IGT_TIMEOUT(end_time);
1276                unsigned int count;
1277
1278                if (!intel_engine_can_store_dword(engine))
1279                        continue;
1280
1281                prev = hang_create_request(&h, engine);
1282                if (IS_ERR(prev)) {
1283                        err = PTR_ERR(prev);
1284                        goto fini;
1285                }
1286
1287                i915_request_get(prev);
1288                i915_request_add(prev);
1289
1290                count = 0;
1291                do {
1292                        struct i915_request *rq;
1293                        unsigned int reset_count;
1294
1295                        rq = hang_create_request(&h, engine);
1296                        if (IS_ERR(rq)) {
1297                                err = PTR_ERR(rq);
1298                                goto fini;
1299                        }
1300
1301                        i915_request_get(rq);
1302                        i915_request_add(rq);
1303
1304                        /*
1305                         * XXX We don't handle resetting the kernel context
1306                         * very well. If we trigger a device reset twice in
1307                         * quick succession while the kernel context is
1308                         * executing, we may end up skipping the breadcrumb.
1309                         * This is really only a problem for the selftest as
1310                         * normally there is a large interlude between resets
1311                         * (hangcheck), or we focus on resetting just one
1312                         * engine and so avoid repeatedly resetting innocents.
1313                         */
1314                        err = wait_for_others(i915, engine);
1315                        if (err) {
1316                                pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
1317                                       __func__, engine->name);
1318                                i915_request_put(rq);
1319                                i915_request_put(prev);
1320
1321                                GEM_TRACE_DUMP();
1322                                i915_gem_set_wedged(i915);
1323                                goto fini;
1324                        }
1325
1326                        if (!wait_until_running(&h, prev)) {
1327                                struct drm_printer p = drm_info_printer(i915->drm.dev);
1328
1329                                pr_err("%s(%s): Failed to start request %x, at %x\n",
1330                                       __func__, engine->name,
1331                                       prev->fence.seqno, hws_seqno(&h, prev));
1332                                intel_engine_dump(engine, &p,
1333                                                  "%s\n", engine->name);
1334
1335                                i915_request_put(rq);
1336                                i915_request_put(prev);
1337
1338                                i915_gem_set_wedged(i915);
1339
1340                                err = -EIO;
1341                                goto fini;
1342                        }
1343
1344                        reset_count = fake_hangcheck(prev, ENGINE_MASK(id));
1345
1346                        i915_reset(i915, ENGINE_MASK(id), NULL);
1347
1348                        GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
1349                                            &i915->gpu_error.flags));
1350
1351                        if (prev->fence.error != -EIO) {
1352                                pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
1353                                       prev->fence.error);
1354                                i915_request_put(rq);
1355                                i915_request_put(prev);
1356                                err = -EINVAL;
1357                                goto fini;
1358                        }
1359
1360                        if (rq->fence.error) {
1361                                pr_err("Fence error status not zero [%d] after unrelated reset\n",
1362                                       rq->fence.error);
1363                                i915_request_put(rq);
1364                                i915_request_put(prev);
1365                                err = -EINVAL;
1366                                goto fini;
1367                        }
1368
1369                        if (i915_reset_count(&i915->gpu_error) == reset_count) {
1370                                pr_err("No GPU reset recorded!\n");
1371                                i915_request_put(rq);
1372                                i915_request_put(prev);
1373                                err = -EINVAL;
1374                                goto fini;
1375                        }
1376
1377                        i915_request_put(prev);
1378                        prev = rq;
1379                        count++;
1380                } while (time_before(jiffies, end_time));
1381                pr_info("%s: Completed %d resets\n", engine->name, count);
1382
1383                *h.batch = MI_BATCH_BUFFER_END;
1384                i915_gem_chipset_flush(i915);
1385
1386                i915_request_put(prev);
1387
1388                err = igt_flush_test(i915, I915_WAIT_LOCKED);
1389                if (err)
1390                        break;
1391        }
1392
1393fini:
1394        hang_fini(&h);
1395unlock:
1396        mutex_unlock(&i915->drm.struct_mutex);
1397        global_reset_unlock(i915);
1398
1399        if (i915_terminally_wedged(&i915->gpu_error))
1400                return -EIO;
1401
1402        return err;
1403}
1404
1405static int igt_handle_error(void *arg)
1406{
1407        struct drm_i915_private *i915 = arg;
1408        struct intel_engine_cs *engine = i915->engine[RCS];
1409        struct hang h;
1410        struct i915_request *rq;
1411        struct i915_gpu_state *error;
1412        int err;
1413
1414        /* Check that we can issue a global GPU and engine reset */
1415
1416        if (!intel_has_reset_engine(i915))
1417                return 0;
1418
1419        if (!engine || !intel_engine_can_store_dword(engine))
1420                return 0;
1421
1422        mutex_lock(&i915->drm.struct_mutex);
1423
1424        err = hang_init(&h, i915);
1425        if (err)
1426                goto err_unlock;
1427
1428        rq = hang_create_request(&h, engine);
1429        if (IS_ERR(rq)) {
1430                err = PTR_ERR(rq);
1431                goto err_fini;
1432        }
1433
1434        i915_request_get(rq);
1435        i915_request_add(rq);
1436
1437        if (!wait_until_running(&h, rq)) {
1438                struct drm_printer p = drm_info_printer(i915->drm.dev);
1439
1440                pr_err("%s: Failed to start request %x, at %x\n",
1441                       __func__, rq->fence.seqno, hws_seqno(&h, rq));
1442                intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1443
1444                i915_gem_set_wedged(i915);
1445
1446                err = -EIO;
1447                goto err_request;
1448        }
1449
1450        mutex_unlock(&i915->drm.struct_mutex);
1451
1452        /* Temporarily disable error capture */
1453        error = xchg(&i915->gpu_error.first_error, (void *)-1);
1454
1455        i915_handle_error(i915, ENGINE_MASK(engine->id), 0, NULL);
1456
1457        xchg(&i915->gpu_error.first_error, error);
1458
1459        mutex_lock(&i915->drm.struct_mutex);
1460
1461        if (rq->fence.error != -EIO) {
1462                pr_err("Guilty request not identified!\n");
1463                err = -EINVAL;
1464                goto err_request;
1465        }
1466
1467err_request:
1468        i915_request_put(rq);
1469err_fini:
1470        hang_fini(&h);
1471err_unlock:
1472        mutex_unlock(&i915->drm.struct_mutex);
1473        return err;
1474}
1475
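     /*
      * Entry point for the live hangcheck selftests. Requires GPU reset
      * support and a device that is not already terminally wedged; the
      * periodic hangcheck is disabled for the duration so that the tests
      * control when resets happen.
      */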
1476int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
1477{
1478        static const struct i915_subtest tests[] = {
1479                SUBTEST(igt_global_reset), /* attempt to recover GPU first */
1480                SUBTEST(igt_hang_sanitycheck),
1481                SUBTEST(igt_reset_idle_engine),
1482                SUBTEST(igt_reset_active_engine),
1483                SUBTEST(igt_reset_engines),
1484                SUBTEST(igt_reset_queue),
1485                SUBTEST(igt_reset_wait),
1486                SUBTEST(igt_reset_evict_ggtt),
1487                SUBTEST(igt_reset_evict_ppgtt),
1488                SUBTEST(igt_reset_evict_fence),
1489                SUBTEST(igt_handle_error),
1490        };
1491        bool saved_hangcheck;
1492        int err;
1493
1494        if (!intel_has_gpu_reset(i915))
1495                return 0;
1496
1497        if (i915_terminally_wedged(&i915->gpu_error))
1498                return -EIO; /* we're long past hope of a successful reset */
1499
1500        intel_runtime_pm_get(i915);
1501        saved_hangcheck = fetch_and_zero(&i915_modparams.enable_hangcheck);
1502
1503        err = i915_subtests(tests, i915);
1504
1505        mutex_lock(&i915->drm.struct_mutex);
1506        igt_flush_test(i915, I915_WAIT_LOCKED);
1507        mutex_unlock(&i915->drm.struct_mutex);
1508
1509        i915_modparams.enable_hangcheck = saved_hangcheck;
1510        intel_runtime_pm_put(i915);
1511
1512        return err;
1513}
1514