linux/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <linux/kthread.h>

#include "../i915_selftest.h"

#include "mock_context.h"
#include "mock_drm.h"

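/*
 * Fixture shared by the hangcheck selftests: a kernel context, a batch
 * object that spins until rewritten (the "hang"), and a scratch seqno page
 * ("hws") that the spinning batch writes its fence seqno into so the tests
 * can tell when a batch has actually started executing on the GPU.
 */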
struct hang {
        struct drm_i915_private *i915;
        struct drm_i915_gem_object *hws;
        struct drm_i915_gem_object *obj;
        struct i915_gem_context *ctx;
        u32 *seqno;
        u32 *batch;
};

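/*
 * Set up the fixture: create the kernel context, the seqno page and the
 * batch object, and pin CPU maps of both so the tests can poll and rewrite
 * them while they are busy on the GPU.
 */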
static int hang_init(struct hang *h, struct drm_i915_private *i915)
{
        void *vaddr;
        int err;

        memset(h, 0, sizeof(*h));
        h->i915 = i915;

        h->ctx = kernel_context(i915);
        if (IS_ERR(h->ctx))
                return PTR_ERR(h->ctx);

        h->hws = i915_gem_object_create_internal(i915, PAGE_SIZE);
        if (IS_ERR(h->hws)) {
                err = PTR_ERR(h->hws);
                goto err_ctx;
        }

        h->obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
        if (IS_ERR(h->obj)) {
                err = PTR_ERR(h->obj);
                goto err_hws;
        }

        i915_gem_object_set_cache_level(h->hws, I915_CACHE_LLC);
        vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
        if (IS_ERR(vaddr)) {
                err = PTR_ERR(vaddr);
                goto err_obj;
        }
        h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

        vaddr = i915_gem_object_pin_map(h->obj,
                                        HAS_LLC(i915) ? I915_MAP_WB : I915_MAP_WC);
        if (IS_ERR(vaddr)) {
                err = PTR_ERR(vaddr);
                goto err_unpin_hws;
        }
        h->batch = vaddr;

        return 0;

err_unpin_hws:
        i915_gem_object_unpin_map(h->hws);
err_obj:
        i915_gem_object_put(h->obj);
err_hws:
        i915_gem_object_put(h->hws);
err_ctx:
        kernel_context_close(h->ctx);
        return err;
}

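/* Each fence context gets its own dword slot within the seqno page. */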
static u64 hws_address(const struct i915_vma *hws,
                       const struct i915_request *rq)
{
        return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}

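/*
 * Build the hanging batch: store the request's seqno into its slot in the
 * seqno page, then branch back to the start of the batch so it spins until
 * the MI_BATCH_BUFFER_START is overwritten with MI_BATCH_BUFFER_END (or the
 * GPU is reset). The MI_ARB_CHECK instructions keep arbitration points in
 * the loop so the spinning batch can still be preempted.
 */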
static int emit_recurse_batch(struct hang *h,
                              struct i915_request *rq)
{
        struct drm_i915_private *i915 = h->i915;
        struct i915_address_space *vm =
                rq->ctx->ppgtt ? &rq->ctx->ppgtt->base : &i915->ggtt.base;
        struct i915_vma *hws, *vma;
        unsigned int flags;
        u32 *batch;
        int err;

        vma = i915_vma_instance(h->obj, vm, NULL);
        if (IS_ERR(vma))
                return PTR_ERR(vma);

        hws = i915_vma_instance(h->hws, vm, NULL);
        if (IS_ERR(hws))
                return PTR_ERR(hws);

        err = i915_vma_pin(vma, 0, 0, PIN_USER);
        if (err)
                return err;

        err = i915_vma_pin(hws, 0, 0, PIN_USER);
        if (err)
                goto unpin_vma;

        i915_vma_move_to_active(vma, rq, 0);
        if (!i915_gem_object_has_active_reference(vma->obj)) {
                i915_gem_object_get(vma->obj);
                i915_gem_object_set_active_reference(vma->obj);
        }

        i915_vma_move_to_active(hws, rq, 0);
        if (!i915_gem_object_has_active_reference(hws->obj)) {
                i915_gem_object_get(hws->obj);
                i915_gem_object_set_active_reference(hws->obj);
        }

        batch = h->batch;
        if (INTEL_GEN(i915) >= 8) {
                *batch++ = MI_STORE_DWORD_IMM_GEN4;
                *batch++ = lower_32_bits(hws_address(hws, rq));
                *batch++ = upper_32_bits(hws_address(hws, rq));
                *batch++ = rq->fence.seqno;
                *batch++ = MI_ARB_CHECK;

                memset(batch, 0, 1024);
                batch += 1024 / sizeof(*batch);

                *batch++ = MI_ARB_CHECK;
                *batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
                *batch++ = lower_32_bits(vma->node.start);
                *batch++ = upper_32_bits(vma->node.start);
        } else if (INTEL_GEN(i915) >= 6) {
                *batch++ = MI_STORE_DWORD_IMM_GEN4;
                *batch++ = 0;
                *batch++ = lower_32_bits(hws_address(hws, rq));
                *batch++ = rq->fence.seqno;
                *batch++ = MI_ARB_CHECK;

                memset(batch, 0, 1024);
                batch += 1024 / sizeof(*batch);

                *batch++ = MI_ARB_CHECK;
                *batch++ = MI_BATCH_BUFFER_START | 1 << 8;
                *batch++ = lower_32_bits(vma->node.start);
        } else if (INTEL_GEN(i915) >= 4) {
                *batch++ = MI_STORE_DWORD_IMM_GEN4 | 1 << 22;
                *batch++ = 0;
                *batch++ = lower_32_bits(hws_address(hws, rq));
                *batch++ = rq->fence.seqno;
                *batch++ = MI_ARB_CHECK;

                memset(batch, 0, 1024);
                batch += 1024 / sizeof(*batch);

                *batch++ = MI_ARB_CHECK;
                *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
                *batch++ = lower_32_bits(vma->node.start);
        } else {
                *batch++ = MI_STORE_DWORD_IMM;
                *batch++ = lower_32_bits(hws_address(hws, rq));
                *batch++ = rq->fence.seqno;
                *batch++ = MI_ARB_CHECK;

                memset(batch, 0, 1024);
                batch += 1024 / sizeof(*batch);

                *batch++ = MI_ARB_CHECK;
                *batch++ = MI_BATCH_BUFFER_START | 2 << 6 | 1;
                *batch++ = lower_32_bits(vma->node.start);
        }
        *batch++ = MI_BATCH_BUFFER_END; /* not reached */
        i915_gem_chipset_flush(h->i915);

        flags = 0;
        if (INTEL_GEN(vm->i915) <= 5)
                flags |= I915_DISPATCH_SECURE;

        err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

        i915_vma_unpin(hws);
unpin_vma:
        i915_vma_unpin(vma);
        return err;
}

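/*
 * Allocate a request carrying the hanging batch for @engine. If the current
 * batch object is still busy from an earlier hang, swap in a freshly
 * allocated one so a new spinning batch can be emitted without waiting for
 * the old one to complete.
 */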
static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
        struct i915_request *rq;
        int err;

        if (i915_gem_object_is_active(h->obj)) {
                struct drm_i915_gem_object *obj;
                void *vaddr;

                obj = i915_gem_object_create_internal(h->i915, PAGE_SIZE);
                if (IS_ERR(obj))
                        return ERR_CAST(obj);

                vaddr = i915_gem_object_pin_map(obj,
                                                HAS_LLC(h->i915) ? I915_MAP_WB : I915_MAP_WC);
                if (IS_ERR(vaddr)) {
                        i915_gem_object_put(obj);
                        return ERR_CAST(vaddr);
                }

                i915_gem_object_unpin_map(h->obj);
                i915_gem_object_put(h->obj);

                h->obj = obj;
                h->batch = vaddr;
        }

        rq = i915_request_alloc(engine, h->ctx);
        if (IS_ERR(rq))
                return rq;

        err = emit_recurse_batch(h, rq);
        if (err) {
                __i915_request_add(rq, false);
                return ERR_PTR(err);
        }

        return rq;
}

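/* Read back the seqno the spinning batch wrote for this request's context. */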
static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
        return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

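/*
 * Safety net for the selftests: if a wait inside the test takes longer than
 * the given timeout, declare the GPU wedged so the test aborts instead of
 * hanging the machine.
 */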
struct wedge_me {
        struct delayed_work work;
        struct drm_i915_private *i915;
        const void *symbol;
};

static void wedge_me(struct work_struct *work)
{
        struct wedge_me *w = container_of(work, typeof(*w), work.work);

        pr_err("%pS timed out, cancelling all further testing.\n",
               w->symbol);
        i915_gem_set_wedged(w->i915);
}

static void __init_wedge(struct wedge_me *w,
                         struct drm_i915_private *i915,
                         long timeout,
                         const void *symbol)
{
        w->i915 = i915;
        w->symbol = symbol;

        INIT_DELAYED_WORK_ONSTACK(&w->work, wedge_me);
        schedule_delayed_work(&w->work, timeout);
}

static void __fini_wedge(struct wedge_me *w)
{
        cancel_delayed_work_sync(&w->work);
        destroy_delayed_work_on_stack(&w->work);
        w->i915 = NULL;
}

#define wedge_on_timeout(W, DEV, TIMEOUT)                               \
        for (__init_wedge((W), (DEV), (TIMEOUT), __builtin_return_address(0)); \
             (W)->i915;                                                 \
             __fini_wedge((W)))

static noinline int
flush_test(struct drm_i915_private *i915, unsigned int flags)
{
        struct wedge_me w;

        cond_resched();

        wedge_on_timeout(&w, i915, HZ)
                i915_gem_wait_for_idle(i915, flags);

        return i915_terminally_wedged(&i915->gpu_error) ? -EIO : 0;
}

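/*
 * Tear down the fixture: terminate any still-spinning batch by rewriting it
 * to MI_BATCH_BUFFER_END, then release the objects and context and wait for
 * the GPU to idle.
 */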
static void hang_fini(struct hang *h)
{
        *h->batch = MI_BATCH_BUFFER_END;
        i915_gem_chipset_flush(h->i915);

        i915_gem_object_unpin_map(h->obj);
        i915_gem_object_put(h->obj);

        i915_gem_object_unpin_map(h->hws);
        i915_gem_object_put(h->hws);

        kernel_context_close(h->ctx);

        flush_test(h->i915, I915_WAIT_LOCKED);
}

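/*
 * Wait for the spinning batch to report, via its write to the seqno page,
 * that it has started executing: first a short busy-wait, then a sleeping
 * wait of up to a second. Returns true if the batch is running on the GPU.
 */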
static bool wait_for_hang(struct hang *h, struct i915_request *rq)
{
        return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
                                               rq->fence.seqno),
                             10) &&
                 wait_for(i915_seqno_passed(hws_seqno(h, rq),
                                            rq->fence.seqno),
                          1000));
}

static int igt_hang_sanitycheck(void *arg)
{
        struct drm_i915_private *i915 = arg;
        struct i915_request *rq;
        struct intel_engine_cs *engine;
        enum intel_engine_id id;
        struct hang h;
        int err;

        /* Basic check that we can execute our hanging batch */

        mutex_lock(&i915->drm.struct_mutex);
        err = hang_init(&h, i915);
        if (err)
                goto unlock;

        for_each_engine(engine, i915, id) {
                long timeout;

                if (!intel_engine_can_store_dword(engine))
                        continue;

                rq = hang_create_request(&h, engine);
                if (IS_ERR(rq)) {
                        err = PTR_ERR(rq);
                        pr_err("Failed to create request for %s, err=%d\n",
                               engine->name, err);
                        goto fini;
                }

                i915_request_get(rq);

                *h.batch = MI_BATCH_BUFFER_END;
                i915_gem_chipset_flush(i915);

                __i915_request_add(rq, true);

                timeout = i915_request_wait(rq,
                                            I915_WAIT_LOCKED,
                                            MAX_SCHEDULE_TIMEOUT);
                i915_request_put(rq);

                if (timeout < 0) {
                        err = timeout;
                        pr_err("Wait for request failed on %s, err=%d\n",
                               engine->name, err);
                        goto fini;
                }
        }

fini:
        hang_fini(&h);
unlock:
        mutex_unlock(&i915->drm.struct_mutex);
        return err;
}

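/*
 * Claim the reset machinery for the test: take the global BACKOFF bit and
 * every per-engine reset bit, waiting for any reset already in flight to
 * finish, so nothing else can trigger a reset while the test owns it.
 */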
static void global_reset_lock(struct drm_i915_private *i915)
{
        struct intel_engine_cs *engine;
        enum intel_engine_id id;

        pr_debug("%s: current gpu_error=%08lx\n",
                 __func__, i915->gpu_error.flags);

        while (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags))
                wait_event(i915->gpu_error.reset_queue,
                           !test_bit(I915_RESET_BACKOFF,
                                     &i915->gpu_error.flags));

        for_each_engine(engine, i915, id) {
                while (test_and_set_bit(I915_RESET_ENGINE + id,
                                        &i915->gpu_error.flags))
                        wait_on_bit(&i915->gpu_error.flags,
                                    I915_RESET_ENGINE + id,
                                    TASK_UNINTERRUPTIBLE);
        }
}

static void global_reset_unlock(struct drm_i915_private *i915)
{
        struct intel_engine_cs *engine;
        enum intel_engine_id id;

        for_each_engine(engine, i915, id)
                clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);

        clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
        wake_up_all(&i915->gpu_error.reset_queue);
}

static int igt_global_reset(void *arg)
{
        struct drm_i915_private *i915 = arg;
        unsigned int reset_count;
        int err = 0;

        /* Check that we can issue a global GPU reset */

        global_reset_lock(i915);
        set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);

        mutex_lock(&i915->drm.struct_mutex);
        reset_count = i915_reset_count(&i915->gpu_error);

        i915_reset(i915, I915_RESET_QUIET);

        if (i915_reset_count(&i915->gpu_error) == reset_count) {
                pr_err("No GPU reset recorded!\n");
                err = -EINVAL;
        }
        mutex_unlock(&i915->drm.struct_mutex);

        GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
        global_reset_unlock(i915);

        if (i915_terminally_wedged(&i915->gpu_error))
                err = -EIO;

        return err;
}

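/*
 * Exercise per-engine reset on every engine. With @active set, a hanging
 * batch is submitted first so the reset has real work to kill; without it,
 * the engine is reset while idle and the per-engine reset count is expected
 * to stay unchanged.
 */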
static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
{
        struct intel_engine_cs *engine;
        enum intel_engine_id id;
        struct hang h;
        int err = 0;

        /* Check that we can issue an engine reset on an idle engine (no-op) */

        if (!intel_has_reset_engine(i915))
                return 0;

        if (active) {
                mutex_lock(&i915->drm.struct_mutex);
                err = hang_init(&h, i915);
                mutex_unlock(&i915->drm.struct_mutex);
                if (err)
                        return err;
        }

        for_each_engine(engine, i915, id) {
                unsigned int reset_count, reset_engine_count;
                IGT_TIMEOUT(end_time);

                if (active && !intel_engine_can_store_dword(engine))
                        continue;

                reset_count = i915_reset_count(&i915->gpu_error);
                reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
                                                             engine);

                set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
                do {
                        if (active) {
                                struct i915_request *rq;

                                mutex_lock(&i915->drm.struct_mutex);
                                rq = hang_create_request(&h, engine);
                                if (IS_ERR(rq)) {
                                        err = PTR_ERR(rq);
                                        mutex_unlock(&i915->drm.struct_mutex);
                                        break;
                                }

                                i915_request_get(rq);
                                __i915_request_add(rq, true);
                                mutex_unlock(&i915->drm.struct_mutex);

                                if (!wait_for_hang(&h, rq)) {
                                        struct drm_printer p = drm_info_printer(i915->drm.dev);

                                        pr_err("%s: Failed to start request %x, at %x\n",
                                               __func__, rq->fence.seqno, hws_seqno(&h, rq));
                                        intel_engine_dump(engine, &p,
                                                          "%s\n", engine->name);

                                        i915_request_put(rq);
                                        err = -EIO;
                                        break;
                                }

                                i915_request_put(rq);
                        }

                        engine->hangcheck.stalled = true;
                        engine->hangcheck.seqno =
                                intel_engine_get_seqno(engine);

                        err = i915_reset_engine(engine, I915_RESET_QUIET);
                        if (err) {
                                pr_err("i915_reset_engine failed\n");
                                break;
                        }

                        if (i915_reset_count(&i915->gpu_error) != reset_count) {
                                pr_err("Full GPU reset recorded! (engine reset expected)\n");
                                err = -EINVAL;
                                break;
                        }

                        reset_engine_count += active;
                        if (i915_reset_engine_count(&i915->gpu_error, engine) !=
                            reset_engine_count) {
                                pr_err("%s engine reset %srecorded!\n",
                                       engine->name, active ? "not " : "");
                                err = -EINVAL;
                                break;
                        }

                        engine->hangcheck.stalled = false;
                } while (time_before(jiffies, end_time));
                clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);

                if (err)
                        break;

                err = flush_test(i915, 0);
                if (err)
                        break;
        }

        if (i915_terminally_wedged(&i915->gpu_error))
                err = -EIO;

        if (active) {
                mutex_lock(&i915->drm.struct_mutex);
                hang_fini(&h);
                mutex_unlock(&i915->drm.struct_mutex);
        }

        return err;
}

static int igt_reset_idle_engine(void *arg)
{
        return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
        return __igt_reset_engine(arg, true);
}

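/*
 * Thread body used to keep an "innocent" engine busy while another engine
 * is being reset: it alternates between two contexts, always keeping a
 * request in flight on the engine until the thread is asked to stop.
 */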
static int active_engine(void *data)
{
        struct intel_engine_cs *engine = data;
        struct i915_request *rq[2] = {};
        struct i915_gem_context *ctx[2];
        struct drm_file *file;
        unsigned long count = 0;
        int err = 0;

        file = mock_file(engine->i915);
        if (IS_ERR(file))
                return PTR_ERR(file);

        mutex_lock(&engine->i915->drm.struct_mutex);
        ctx[0] = live_context(engine->i915, file);
        mutex_unlock(&engine->i915->drm.struct_mutex);
        if (IS_ERR(ctx[0])) {
                err = PTR_ERR(ctx[0]);
                goto err_file;
        }

        mutex_lock(&engine->i915->drm.struct_mutex);
        ctx[1] = live_context(engine->i915, file);
        mutex_unlock(&engine->i915->drm.struct_mutex);
        if (IS_ERR(ctx[1])) {
                err = PTR_ERR(ctx[1]);
                i915_gem_context_put(ctx[0]);
                goto err_file;
        }

        while (!kthread_should_stop()) {
                unsigned int idx = count++ & 1;
                struct i915_request *old = rq[idx];
                struct i915_request *new;

                mutex_lock(&engine->i915->drm.struct_mutex);
                new = i915_request_alloc(engine, ctx[idx]);
                if (IS_ERR(new)) {
                        mutex_unlock(&engine->i915->drm.struct_mutex);
                        err = PTR_ERR(new);
                        break;
                }

                rq[idx] = i915_request_get(new);
                i915_request_add(new);
                mutex_unlock(&engine->i915->drm.struct_mutex);

                if (old) {
                        i915_request_wait(old, 0, MAX_SCHEDULE_TIMEOUT);
                        i915_request_put(old);
                }
        }

        for (count = 0; count < ARRAY_SIZE(rq); count++)
                i915_request_put(rq[count]);

err_file:
        mock_file_free(engine->i915, file);
        return err;
}

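/*
 * Reset one engine repeatedly while kthreads keep every other engine busy,
 * then verify that only the target engine's reset count moved, that no
 * innocent engine was reset, and that no full GPU reset was recorded.
 */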
static int __igt_reset_engine_others(struct drm_i915_private *i915,
                                     bool active)
{
        struct intel_engine_cs *engine, *other;
        enum intel_engine_id id, tmp;
        struct hang h;
        int err = 0;

        /* Check that issuing a reset on one engine does not interfere
         * with any other engine.
         */

        if (!intel_has_reset_engine(i915))
                return 0;

        if (active) {
                mutex_lock(&i915->drm.struct_mutex);
                err = hang_init(&h, i915);
                mutex_unlock(&i915->drm.struct_mutex);
                if (err)
                        return err;
        }

        for_each_engine(engine, i915, id) {
                struct task_struct *threads[I915_NUM_ENGINES] = {};
                unsigned long resets[I915_NUM_ENGINES];
                unsigned long global = i915_reset_count(&i915->gpu_error);
                unsigned long count = 0;
                IGT_TIMEOUT(end_time);

                if (active && !intel_engine_can_store_dword(engine))
                        continue;

                memset(threads, 0, sizeof(threads));
                for_each_engine(other, i915, tmp) {
                        struct task_struct *tsk;

                        resets[tmp] = i915_reset_engine_count(&i915->gpu_error,
                                                              other);

                        if (other == engine)
                                continue;

                        tsk = kthread_run(active_engine, other,
                                          "igt/%s", other->name);
                        if (IS_ERR(tsk)) {
                                err = PTR_ERR(tsk);
                                goto unwind;
                        }

                        threads[tmp] = tsk;
                        get_task_struct(tsk);
                }

                set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
                do {
                        if (active) {
                                struct i915_request *rq;

                                mutex_lock(&i915->drm.struct_mutex);
                                rq = hang_create_request(&h, engine);
                                if (IS_ERR(rq)) {
                                        err = PTR_ERR(rq);
                                        mutex_unlock(&i915->drm.struct_mutex);
                                        break;
                                }

                                i915_request_get(rq);
                                __i915_request_add(rq, true);
                                mutex_unlock(&i915->drm.struct_mutex);

                                if (!wait_for_hang(&h, rq)) {
                                        struct drm_printer p = drm_info_printer(i915->drm.dev);

                                        pr_err("%s: Failed to start request %x, at %x\n",
                                               __func__, rq->fence.seqno, hws_seqno(&h, rq));
                                        intel_engine_dump(engine, &p,
                                                          "%s\n", engine->name);

                                        i915_request_put(rq);
                                        err = -EIO;
                                        break;
                                }

                                i915_request_put(rq);
                        }

                        engine->hangcheck.stalled = true;
                        engine->hangcheck.seqno =
                                intel_engine_get_seqno(engine);

                        err = i915_reset_engine(engine, I915_RESET_QUIET);
                        if (err) {
                                pr_err("i915_reset_engine(%s:%s) failed, err=%d\n",
                                       engine->name, active ? "active" : "idle", err);
                                break;
                        }

                        engine->hangcheck.stalled = false;
                        count++;
                } while (time_before(jiffies, end_time));
                clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
                pr_info("i915_reset_engine(%s:%s): %lu resets\n",
                        engine->name, active ? "active" : "idle", count);

                if (i915_reset_engine_count(&i915->gpu_error, engine) -
                    resets[engine->id] != (active ? count : 0)) {
                        pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
                               engine->name, active ? "active" : "idle", count,
                               i915_reset_engine_count(&i915->gpu_error,
                                                       engine) - resets[engine->id]);
                        if (!err)
                                err = -EINVAL;
                }

unwind:
                for_each_engine(other, i915, tmp) {
                        int ret;

                        if (!threads[tmp])
                                continue;

                        ret = kthread_stop(threads[tmp]);
                        if (ret) {
                                pr_err("kthread for other engine %s failed, err=%d\n",
                                       other->name, ret);
                                if (!err)
                                        err = ret;
                        }
                        put_task_struct(threads[tmp]);

                        if (resets[tmp] != i915_reset_engine_count(&i915->gpu_error,
                                                                   other)) {
                                pr_err("Innocent engine %s was reset (count=%ld)\n",
                                       other->name,
                                       i915_reset_engine_count(&i915->gpu_error,
                                                               other) - resets[tmp]);
                                if (!err)
                                        err = -EINVAL;
                        }
                }

                if (global != i915_reset_count(&i915->gpu_error)) {
                        pr_err("Global reset (count=%ld)!\n",
                               i915_reset_count(&i915->gpu_error) - global);
                        if (!err)
                                err = -EINVAL;
                }

                if (err)
                        break;

                err = flush_test(i915, 0);
                if (err)
                        break;
        }

        if (i915_terminally_wedged(&i915->gpu_error))
                err = -EIO;

        if (active) {
                mutex_lock(&i915->drm.struct_mutex);
                hang_fini(&h);
                mutex_unlock(&i915->drm.struct_mutex);
        }

        return err;
}

static int igt_reset_idle_engine_others(void *arg)
{
        return __igt_reset_engine_others(arg, false);
}

static int igt_reset_active_engine_others(void *arg)
{
        return __igt_reset_engine_others(arg, true);
}

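/*
 * Pretend hangcheck has declared the engine hung: mark the engine as stalled
 * at its current seqno and raise I915_RESET_HANDOFF so that a waiter picks
 * up and performs the reset, much as the real hangcheck worker would.
 */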
static u32 fake_hangcheck(struct i915_request *rq)
{
        u32 reset_count;

        rq->engine->hangcheck.stalled = true;
        rq->engine->hangcheck.seqno = intel_engine_get_seqno(rq->engine);

        reset_count = i915_reset_count(&rq->i915->gpu_error);

        set_bit(I915_RESET_HANDOFF, &rq->i915->gpu_error.flags);
        wake_up_all(&rq->i915->gpu_error.wait_queue);

        return reset_count;
}

static int igt_wait_reset(void *arg)
{
        struct drm_i915_private *i915 = arg;
        struct i915_request *rq;
        unsigned int reset_count;
        struct hang h;
        long timeout;
        int err;

        if (!intel_engine_can_store_dword(i915->engine[RCS]))
                return 0;

        /* Check that we detect a stuck waiter and issue a reset */

        global_reset_lock(i915);

        mutex_lock(&i915->drm.struct_mutex);
        err = hang_init(&h, i915);
        if (err)
                goto unlock;

        rq = hang_create_request(&h, i915->engine[RCS]);
        if (IS_ERR(rq)) {
                err = PTR_ERR(rq);
                goto fini;
        }

        i915_request_get(rq);
        __i915_request_add(rq, true);

        if (!wait_for_hang(&h, rq)) {
                struct drm_printer p = drm_info_printer(i915->drm.dev);

                pr_err("%s: Failed to start request %x, at %x\n",
                       __func__, rq->fence.seqno, hws_seqno(&h, rq));
                intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

                i915_reset(i915, 0);
                i915_gem_set_wedged(i915);

                err = -EIO;
                goto out_rq;
        }

        reset_count = fake_hangcheck(rq);

        timeout = i915_request_wait(rq, I915_WAIT_LOCKED, 10);
        if (timeout < 0) {
                pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
                       timeout);
                err = timeout;
                goto out_rq;
        }

        GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
        if (i915_reset_count(&i915->gpu_error) == reset_count) {
                pr_err("No GPU reset recorded!\n");
                err = -EINVAL;
                goto out_rq;
        }

out_rq:
        i915_request_put(rq);
fini:
        hang_fini(&h);
unlock:
        mutex_unlock(&i915->drm.struct_mutex);
        global_reset_unlock(i915);

        if (i915_terminally_wedged(&i915->gpu_error))
                return -EIO;

        return err;
}

static int igt_reset_queue(void *arg)
{
        struct drm_i915_private *i915 = arg;
        struct intel_engine_cs *engine;
        enum intel_engine_id id;
        struct hang h;
        int err;

        /* Check that we replay pending requests following a hang */

        global_reset_lock(i915);

        mutex_lock(&i915->drm.struct_mutex);
        err = hang_init(&h, i915);
        if (err)
                goto unlock;

        for_each_engine(engine, i915, id) {
                struct i915_request *prev;
                IGT_TIMEOUT(end_time);
                unsigned int count;

                if (!intel_engine_can_store_dword(engine))
                        continue;

                prev = hang_create_request(&h, engine);
                if (IS_ERR(prev)) {
                        err = PTR_ERR(prev);
                        goto fini;
                }

                i915_request_get(prev);
                __i915_request_add(prev, true);

                count = 0;
                do {
                        struct i915_request *rq;
                        unsigned int reset_count;

                        rq = hang_create_request(&h, engine);
                        if (IS_ERR(rq)) {
                                err = PTR_ERR(rq);
                                goto fini;
                        }

                        i915_request_get(rq);
                        __i915_request_add(rq, true);

                        if (!wait_for_hang(&h, prev)) {
                                struct drm_printer p = drm_info_printer(i915->drm.dev);

                                pr_err("%s: Failed to start request %x, at %x\n",
                                       __func__, prev->fence.seqno, hws_seqno(&h, prev));
                                intel_engine_dump(prev->engine, &p,
                                                  "%s\n", prev->engine->name);

                                i915_request_put(rq);
                                i915_request_put(prev);

                                i915_reset(i915, 0);
                                i915_gem_set_wedged(i915);

                                err = -EIO;
                                goto fini;
                        }

                        reset_count = fake_hangcheck(prev);

                        i915_reset(i915, I915_RESET_QUIET);

                        GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
                                            &i915->gpu_error.flags));

                        if (prev->fence.error != -EIO) {
                                pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
                                       prev->fence.error);
                                i915_request_put(rq);
                                i915_request_put(prev);
                                err = -EINVAL;
                                goto fini;
                        }

                        if (rq->fence.error) {
                                pr_err("Fence error status not zero [%d] after unrelated reset\n",
                                       rq->fence.error);
                                i915_request_put(rq);
                                i915_request_put(prev);
                                err = -EINVAL;
                                goto fini;
                        }

                        if (i915_reset_count(&i915->gpu_error) == reset_count) {
                                pr_err("No GPU reset recorded!\n");
                                i915_request_put(rq);
                                i915_request_put(prev);
                                err = -EINVAL;
                                goto fini;
                        }

                        i915_request_put(prev);
                        prev = rq;
                        count++;
                } while (time_before(jiffies, end_time));
                pr_info("%s: Completed %d resets\n", engine->name, count);

                *h.batch = MI_BATCH_BUFFER_END;
                i915_gem_chipset_flush(i915);

                i915_request_put(prev);

                err = flush_test(i915, I915_WAIT_LOCKED);
                if (err)
                        break;
        }

fini:
        hang_fini(&h);
unlock:
        mutex_unlock(&i915->drm.struct_mutex);
        global_reset_unlock(i915);

        if (i915_terminally_wedged(&i915->gpu_error))
                return -EIO;

        return err;
}

static int igt_handle_error(void *arg)
{
        struct drm_i915_private *i915 = arg;
        struct intel_engine_cs *engine = i915->engine[RCS];
        struct hang h;
        struct i915_request *rq;
        struct i915_gpu_state *error;
        int err;

        /* Check that we can issue a global GPU and engine reset */

        if (!intel_has_reset_engine(i915))
                return 0;

        if (!intel_engine_can_store_dword(i915->engine[RCS]))
                return 0;

        mutex_lock(&i915->drm.struct_mutex);

        err = hang_init(&h, i915);
        if (err)
                goto err_unlock;

        rq = hang_create_request(&h, engine);
        if (IS_ERR(rq)) {
                err = PTR_ERR(rq);
                goto err_fini;
        }

        i915_request_get(rq);
        __i915_request_add(rq, true);

        if (!wait_for_hang(&h, rq)) {
                struct drm_printer p = drm_info_printer(i915->drm.dev);

                pr_err("%s: Failed to start request %x, at %x\n",
                       __func__, rq->fence.seqno, hws_seqno(&h, rq));
                intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

                i915_reset(i915, 0);
                i915_gem_set_wedged(i915);

                err = -EIO;
                goto err_request;
        }

        mutex_unlock(&i915->drm.struct_mutex);

        /* Temporarily disable error capture */
        error = xchg(&i915->gpu_error.first_error, (void *)-1);

        engine->hangcheck.stalled = true;
        engine->hangcheck.seqno = intel_engine_get_seqno(engine);

        i915_handle_error(i915, intel_engine_flag(engine), "%s", __func__);

        xchg(&i915->gpu_error.first_error, error);

        mutex_lock(&i915->drm.struct_mutex);

        if (rq->fence.error != -EIO) {
                pr_err("Guilty request not identified!\n");
                err = -EINVAL;
                goto err_request;
        }

err_request:
        i915_request_put(rq);
err_fini:
        hang_fini(&h);
err_unlock:
        mutex_unlock(&i915->drm.struct_mutex);
        return err;
}

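/*
 * Entry point for the live hangcheck/reset selftests: skip if the GPU cannot
 * be reset, otherwise hold a runtime PM reference and temporarily disable
 * the periodic hangcheck (i915.enable_hangcheck) while the subtests run.
 */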
int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
        static const struct i915_subtest tests[] = {
                SUBTEST(igt_global_reset), /* attempt to recover GPU first */
                SUBTEST(igt_hang_sanitycheck),
                SUBTEST(igt_reset_idle_engine),
                SUBTEST(igt_reset_active_engine),
                SUBTEST(igt_reset_idle_engine_others),
                SUBTEST(igt_reset_active_engine_others),
                SUBTEST(igt_wait_reset),
                SUBTEST(igt_reset_queue),
                SUBTEST(igt_handle_error),
        };
        bool saved_hangcheck;
        int err;

        if (!intel_has_gpu_reset(i915))
                return 0;

        intel_runtime_pm_get(i915);
        saved_hangcheck = fetch_and_zero(&i915_modparams.enable_hangcheck);

        err = i915_subtests(tests, i915);

        i915_modparams.enable_hangcheck = saved_hangcheck;
        intel_runtime_pm_put(i915);

        return err;
}