linux/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
// SPDX-License-Identifier: MIT
/*
 * Copyright © 2016 Intel Corporation
 */

#include <linux/kthread.h>

#include "gem/i915_gem_context.h"

#include "intel_gt.h"
#include "intel_engine_heartbeat.h"
#include "intel_engine_pm.h"
#include "selftest_engine_heartbeat.h"

#include "i915_selftest.h"
#include "selftests/i915_random.h"
#include "selftests/igt_flush_test.h"
#include "selftests/igt_reset.h"
#include "selftests/igt_atomic.h"

#include "selftests/mock_drm.h"

#include "gem/selftests/mock_context.h"
#include "gem/selftests/igt_gem_utils.h"

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */

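/*
 * Fixture shared by the hang tests: a non-bannable kernel context plus
 * two internal objects -- a HWS page used as per-request seqno scratch,
 * and a batch whose contents spin forever until rewritten (see
 * hang_create_request()).
 */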
struct hang {
        struct intel_gt *gt;
        struct drm_i915_gem_object *hws;
        struct drm_i915_gem_object *obj;
        struct i915_gem_context *ctx;
        u32 *seqno;
        u32 *batch;
};

static int hang_init(struct hang *h, struct intel_gt *gt)
{
        void *vaddr;
        int err;

        memset(h, 0, sizeof(*h));
        h->gt = gt;

        h->ctx = kernel_context(gt->i915);
        if (IS_ERR(h->ctx))
                return PTR_ERR(h->ctx);

        GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));

        h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
        if (IS_ERR(h->hws)) {
                err = PTR_ERR(h->hws);
                goto err_ctx;
        }

        h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
        if (IS_ERR(h->obj)) {
                err = PTR_ERR(h->obj);
                goto err_hws;
        }

        i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
        vaddr = i915_gem_object_pin_map_unlocked(h->hws, I915_MAP_WB);
        if (IS_ERR(vaddr)) {
                err = PTR_ERR(vaddr);
                goto err_obj;
        }
        h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

        vaddr = i915_gem_object_pin_map_unlocked(h->obj,
                                                 i915_coherent_map_type(gt->i915));
        if (IS_ERR(vaddr)) {
                err = PTR_ERR(vaddr);
                goto err_unpin_hws;
        }
        h->batch = vaddr;

        return 0;

err_unpin_hws:
        i915_gem_object_unpin_map(h->hws);
err_obj:
        i915_gem_object_put(h->obj);
err_hws:
        i915_gem_object_put(h->hws);
err_ctx:
        kernel_context_close(h->ctx);
        return err;
}

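/*
 * hws_address()/hws_seqno() map a request to a private u32 slot in the
 * HWS page, keyed by its fence context, so requests on different
 * contexts can report in without trampling each other.
 */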
static u64 hws_address(const struct i915_vma *hws,
                       const struct i915_request *rq)
{
        return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}

static int move_to_active(struct i915_vma *vma,
                          struct i915_request *rq,
                          unsigned int flags)
{
        int err;

        i915_vma_lock(vma);
        err = i915_request_await_object(rq, vma->obj,
                                        flags & EXEC_OBJECT_WRITE);
        if (err == 0)
                err = i915_vma_move_to_active(vma, rq, flags);
        i915_vma_unlock(vma);

        return err;
}

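/*
 * Build and submit a hanging batch: it writes the request's seqno into
 * its HWS slot (so wait_until_running() can see it start) and then
 * branches back to its own start, spinning until the first dword is
 * overwritten with MI_BATCH_BUFFER_END.
 */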
static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
        struct intel_gt *gt = h->gt;
        struct i915_address_space *vm = i915_gem_context_get_vm_rcu(h->ctx);
        struct drm_i915_gem_object *obj;
        struct i915_request *rq = NULL;
        struct i915_vma *hws, *vma;
        unsigned int flags;
        void *vaddr;
        u32 *batch;
        int err;

        obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
        if (IS_ERR(obj)) {
                i915_vm_put(vm);
                return ERR_CAST(obj);
        }

        vaddr = i915_gem_object_pin_map_unlocked(obj, i915_coherent_map_type(gt->i915));
        if (IS_ERR(vaddr)) {
                i915_gem_object_put(obj);
                i915_vm_put(vm);
                return ERR_CAST(vaddr);
        }

        i915_gem_object_unpin_map(h->obj);
        i915_gem_object_put(h->obj);

        h->obj = obj;
        h->batch = vaddr;

        vma = i915_vma_instance(h->obj, vm, NULL);
        if (IS_ERR(vma)) {
                i915_vm_put(vm);
                return ERR_CAST(vma);
        }

        hws = i915_vma_instance(h->hws, vm, NULL);
        if (IS_ERR(hws)) {
                i915_vm_put(vm);
                return ERR_CAST(hws);
        }

        err = i915_vma_pin(vma, 0, 0, PIN_USER);
        if (err) {
                i915_vm_put(vm);
                return ERR_PTR(err);
        }

        err = i915_vma_pin(hws, 0, 0, PIN_USER);
        if (err)
                goto unpin_vma;

        rq = igt_request_alloc(h->ctx, engine);
        if (IS_ERR(rq)) {
                err = PTR_ERR(rq);
                goto unpin_hws;
        }

        err = move_to_active(vma, rq, 0);
        if (err)
                goto cancel_rq;

        err = move_to_active(hws, rq, 0);
        if (err)
                goto cancel_rq;

        batch = h->batch;
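        /*
         * Gen-specific hanging payload: store the seqno to the HWS, skip
         * over 1k of zero padding, then MI_BATCH_BUFFER_START back to the
         * top. The closing MI_BATCH_BUFFER_END is never reached; the loop
         * is broken by rewriting the first dword of the batch.
         */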
        if (INTEL_GEN(gt->i915) >= 8) {
                *batch++ = MI_STORE_DWORD_IMM_GEN4;
                *batch++ = lower_32_bits(hws_address(hws, rq));
                *batch++ = upper_32_bits(hws_address(hws, rq));
                *batch++ = rq->fence.seqno;
                *batch++ = MI_NOOP;

                memset(batch, 0, 1024);
                batch += 1024 / sizeof(*batch);

                *batch++ = MI_NOOP;
                *batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
                *batch++ = lower_32_bits(vma->node.start);
                *batch++ = upper_32_bits(vma->node.start);
        } else if (INTEL_GEN(gt->i915) >= 6) {
                *batch++ = MI_STORE_DWORD_IMM_GEN4;
                *batch++ = 0;
                *batch++ = lower_32_bits(hws_address(hws, rq));
                *batch++ = rq->fence.seqno;
                *batch++ = MI_NOOP;

                memset(batch, 0, 1024);
                batch += 1024 / sizeof(*batch);

                *batch++ = MI_NOOP;
                *batch++ = MI_BATCH_BUFFER_START | 1 << 8;
                *batch++ = lower_32_bits(vma->node.start);
        } else if (INTEL_GEN(gt->i915) >= 4) {
                *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
                *batch++ = 0;
                *batch++ = lower_32_bits(hws_address(hws, rq));
                *batch++ = rq->fence.seqno;
                *batch++ = MI_NOOP;

                memset(batch, 0, 1024);
                batch += 1024 / sizeof(*batch);

                *batch++ = MI_NOOP;
                *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
                *batch++ = lower_32_bits(vma->node.start);
        } else {
                *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
                *batch++ = lower_32_bits(hws_address(hws, rq));
                *batch++ = rq->fence.seqno;
                *batch++ = MI_NOOP;

                memset(batch, 0, 1024);
                batch += 1024 / sizeof(*batch);

                *batch++ = MI_NOOP;
                *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
                *batch++ = lower_32_bits(vma->node.start);
        }
        *batch++ = MI_BATCH_BUFFER_END; /* not reached */
        intel_gt_chipset_flush(engine->gt);

        if (rq->engine->emit_init_breadcrumb) {
                err = rq->engine->emit_init_breadcrumb(rq);
                if (err)
                        goto cancel_rq;
        }

        flags = 0;
        if (INTEL_GEN(gt->i915) <= 5)
                flags |= I915_DISPATCH_SECURE;

        err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

cancel_rq:
        if (err) {
                i915_request_set_error_once(rq, err);
                i915_request_add(rq);
        }
unpin_hws:
        i915_vma_unpin(hws);
unpin_vma:
        i915_vma_unpin(vma);
        i915_vm_put(vm);
        return err ? ERR_PTR(err) : rq;
}

static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
        return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

static void hang_fini(struct hang *h)
{
        *h->batch = MI_BATCH_BUFFER_END;
        intel_gt_chipset_flush(h->gt);

        i915_gem_object_unpin_map(h->obj);
        i915_gem_object_put(h->obj);

        i915_gem_object_unpin_map(h->hws);
        i915_gem_object_put(h->hws);

        kernel_context_close(h->ctx);

        igt_flush_test(h->gt->i915);
}

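/*
 * Poll the request's HWS slot for its seqno: a quick 10us busy-wait for
 * the common case, then a patient wait of up to a second for the batch
 * to actually start executing.
 */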
static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
        return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
                                               rq->fence.seqno),
                             10) &&
                 wait_for(i915_seqno_passed(hws_seqno(h, rq),
                                            rq->fence.seqno),
                          1000));
}

static int igt_hang_sanitycheck(void *arg)
{
        struct intel_gt *gt = arg;
        struct i915_request *rq;
        struct intel_engine_cs *engine;
        enum intel_engine_id id;
        struct hang h;
        int err;

        /* Basic check that we can execute our hanging batch */

        err = hang_init(&h, gt);
        if (err)
                return err;

        for_each_engine(engine, gt, id) {
                struct intel_wedge_me w;
                long timeout;

                if (!intel_engine_can_store_dword(engine))
                        continue;

                rq = hang_create_request(&h, engine);
                if (IS_ERR(rq)) {
                        err = PTR_ERR(rq);
                        pr_err("Failed to create request for %s, err=%d\n",
                               engine->name, err);
                        goto fini;
                }

                i915_request_get(rq);

                *h.batch = MI_BATCH_BUFFER_END;
                intel_gt_chipset_flush(engine->gt);

                i915_request_add(rq);

                timeout = 0;
                intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
                        timeout = i915_request_wait(rq, 0,
                                                    MAX_SCHEDULE_TIMEOUT);
                if (intel_gt_is_wedged(gt))
                        timeout = -EIO;

                i915_request_put(rq);

                if (timeout < 0) {
                        err = timeout;
                        pr_err("Wait for request failed on %s, err=%d\n",
                               engine->name, err);
                        goto fini;
                }
        }

fini:
        hang_fini(&h);
        return err;
}

static bool wait_for_idle(struct intel_engine_cs *engine)
{
        return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}

static int igt_reset_nop(void *arg)
{
        struct intel_gt *gt = arg;
        struct i915_gpu_error *global = &gt->i915->gpu_error;
        struct intel_engine_cs *engine;
        unsigned int reset_count, count;
        enum intel_engine_id id;
        IGT_TIMEOUT(end_time);
        int err = 0;

        /* Check that we can reset during non-user portions of requests */

        reset_count = i915_reset_count(global);
        count = 0;
        do {
                for_each_engine(engine, gt, id) {
                        struct intel_context *ce;
                        int i;

                        ce = intel_context_create(engine);
                        if (IS_ERR(ce)) {
                                err = PTR_ERR(ce);
                                break;
                        }

                        for (i = 0; i < 16; i++) {
                                struct i915_request *rq;

                                rq = intel_context_create_request(ce);
                                if (IS_ERR(rq)) {
                                        err = PTR_ERR(rq);
                                        break;
                                }

                                i915_request_add(rq);
                        }

                        intel_context_put(ce);
                }

                igt_global_reset_lock(gt);
                intel_gt_reset(gt, ALL_ENGINES, NULL);
                igt_global_reset_unlock(gt);

                if (intel_gt_is_wedged(gt)) {
                        err = -EIO;
                        break;
                }

                if (i915_reset_count(global) != reset_count + ++count) {
                        pr_err("Full GPU reset not recorded!\n");
                        err = -EINVAL;
                        break;
                }

                err = igt_flush_test(gt->i915);
                if (err)
                        break;
        } while (time_before(jiffies, end_time));
        pr_info("%s: %d resets\n", __func__, count);

        if (igt_flush_test(gt->i915))
                err = -EIO;
        return err;
}

static int igt_reset_nop_engine(void *arg)
{
        struct intel_gt *gt = arg;
        struct i915_gpu_error *global = &gt->i915->gpu_error;
        struct intel_engine_cs *engine;
        enum intel_engine_id id;

        /* Check that we can engine-reset during non-user portions */

        if (!intel_has_reset_engine(gt))
                return 0;

        for_each_engine(engine, gt, id) {
                unsigned int reset_count, reset_engine_count, count;
                struct intel_context *ce;
                IGT_TIMEOUT(end_time);
                int err;

                ce = intel_context_create(engine);
                if (IS_ERR(ce))
                        return PTR_ERR(ce);

                reset_count = i915_reset_count(global);
                reset_engine_count = i915_reset_engine_count(global, engine);
                count = 0;

                st_engine_heartbeat_disable(engine);
                set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
                do {
                        int i;

                        if (!wait_for_idle(engine)) {
                                pr_err("%s failed to idle before reset\n",
                                       engine->name);
                                err = -EIO;
                                break;
                        }

                        for (i = 0; i < 16; i++) {
                                struct i915_request *rq;

                                rq = intel_context_create_request(ce);
                                if (IS_ERR(rq)) {
                                        struct drm_printer p =
                                                drm_info_printer(gt->i915->drm.dev);
                                        intel_engine_dump(engine, &p,
                                                          "%s(%s): failed to submit request\n",
                                                          __func__,
                                                          engine->name);

                                        GEM_TRACE("%s(%s): failed to submit request\n",
                                                  __func__,
                                                  engine->name);
                                        GEM_TRACE_DUMP();

                                        intel_gt_set_wedged(gt);

                                        err = PTR_ERR(rq);
                                        break;
                                }

                                i915_request_add(rq);
                        }
                        err = intel_engine_reset(engine, NULL);
                        if (err) {
                                pr_err("intel_engine_reset(%s) failed, err:%d\n",
                                       engine->name, err);
                                break;
                        }

                        if (i915_reset_count(global) != reset_count) {
                                pr_err("Full GPU reset recorded! (engine reset expected)\n");
                                err = -EINVAL;
                                break;
                        }

                        if (i915_reset_engine_count(global, engine) !=
                            reset_engine_count + ++count) {
                                pr_err("%s engine reset not recorded!\n",
                                       engine->name);
                                err = -EINVAL;
                                break;
                        }
                } while (time_before(jiffies, end_time));
                clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
                st_engine_heartbeat_enable(engine);

                pr_info("%s(%s): %d resets\n", __func__, engine->name, count);

                intel_context_put(ce);
                if (igt_flush_test(gt->i915))
                        err = -EIO;
                if (err)
                        return err;
        }

        return 0;
}

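/*
 * Force the next intel_engine_reset() to fail via the reset_timeout
 * error-injection point (timeouts are only generated on gen8+), then
 * clear the injection again afterwards.
 */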
static void force_reset_timeout(struct intel_engine_cs *engine)
{
        engine->reset_timeout.probability = 999;
        atomic_set(&engine->reset_timeout.times, -1);
}

static void cancel_reset_timeout(struct intel_engine_cs *engine)
{
        memset(&engine->reset_timeout, 0, sizeof(engine->reset_timeout));
}

static int igt_reset_fail_engine(void *arg)
{
        struct intel_gt *gt = arg;
        struct intel_engine_cs *engine;
        enum intel_engine_id id;

        /* Check that we can recover from engine-reset failures */

        if (!intel_has_reset_engine(gt))
                return 0;

        for_each_engine(engine, gt, id) {
                unsigned int count;
                struct intel_context *ce;
                IGT_TIMEOUT(end_time);
                int err;

                ce = intel_context_create(engine);
                if (IS_ERR(ce))
                        return PTR_ERR(ce);

                st_engine_heartbeat_disable(engine);
                set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);

                force_reset_timeout(engine);
                err = intel_engine_reset(engine, NULL);
                cancel_reset_timeout(engine);
                if (err == 0) /* timeouts only generated on gen8+ */
                        goto skip;

                count = 0;
                do {
                        struct i915_request *last = NULL;
                        int i;

                        if (!wait_for_idle(engine)) {
                                pr_err("%s failed to idle before reset\n",
                                       engine->name);
                                err = -EIO;
                                break;
                        }

                        for (i = 0; i < count % 15; i++) {
                                struct i915_request *rq;

                                rq = intel_context_create_request(ce);
                                if (IS_ERR(rq)) {
                                        struct drm_printer p =
                                                drm_info_printer(gt->i915->drm.dev);
                                        intel_engine_dump(engine, &p,
                                                          "%s(%s): failed to submit request\n",
                                                          __func__,
                                                          engine->name);

                                        GEM_TRACE("%s(%s): failed to submit request\n",
                                                  __func__,
                                                  engine->name);
                                        GEM_TRACE_DUMP();

                                        intel_gt_set_wedged(gt);
                                        if (last)
                                                i915_request_put(last);

                                        err = PTR_ERR(rq);
                                        goto out;
                                }

                                if (last)
                                        i915_request_put(last);
                                last = i915_request_get(rq);
                                i915_request_add(rq);
                        }

                        if (count & 1) {
                                err = intel_engine_reset(engine, NULL);
                                if (err) {
                                        GEM_TRACE_ERR("intel_engine_reset(%s) failed, err:%d\n",
                                                      engine->name, err);
                                        GEM_TRACE_DUMP();
                                        i915_request_put(last);
                                        break;
                                }
                        } else {
                                force_reset_timeout(engine);
                                err = intel_engine_reset(engine, NULL);
                                cancel_reset_timeout(engine);
                                if (err != -ETIMEDOUT) {
                                        pr_err("intel_engine_reset(%s) did not fail, err:%d\n",
                                               engine->name, err);
                                        i915_request_put(last);
                                        break;
                                }
                        }

                        err = 0;
                        if (last) {
                                if (i915_request_wait(last, 0, HZ / 2) < 0) {
                                        struct drm_printer p =
                                                drm_info_printer(gt->i915->drm.dev);

                                        intel_engine_dump(engine, &p,
                                                          "%s(%s): failed to complete request\n",
                                                          __func__,
                                                          engine->name);

                                        GEM_TRACE("%s(%s): failed to complete request\n",
                                                  __func__,
                                                  engine->name);
                                        GEM_TRACE_DUMP();

                                        err = -EIO;
                                }
                                i915_request_put(last);
                        }
                        count++;
                } while (err == 0 && time_before(jiffies, end_time));
out:
                pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
skip:
                clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
                st_engine_heartbeat_enable(engine);
                intel_context_put(ce);

                if (igt_flush_test(gt->i915))
                        err = -EIO;
                if (err)
                        return err;
        }

        return 0;
}

static int __igt_reset_engine(struct intel_gt *gt, bool active)
{
        struct i915_gpu_error *global = &gt->i915->gpu_error;
        struct intel_engine_cs *engine;
        enum intel_engine_id id;
        struct hang h;
        int err = 0;

        /* Check that we can issue an engine reset on an idle (no-op) or active engine */

        if (!intel_has_reset_engine(gt))
                return 0;

        if (active) {
                err = hang_init(&h, gt);
                if (err)
                        return err;
        }

        for_each_engine(engine, gt, id) {
                unsigned int reset_count, reset_engine_count;
                unsigned long count;
                IGT_TIMEOUT(end_time);

                if (active && !intel_engine_can_store_dword(engine))
                        continue;

                if (!wait_for_idle(engine)) {
                        pr_err("%s failed to idle before reset\n",
                               engine->name);
                        err = -EIO;
                        break;
                }

                reset_count = i915_reset_count(global);
                reset_engine_count = i915_reset_engine_count(global, engine);

                st_engine_heartbeat_disable(engine);
                set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
                count = 0;
                do {
                        if (active) {
                                struct i915_request *rq;

                                rq = hang_create_request(&h, engine);
                                if (IS_ERR(rq)) {
                                        err = PTR_ERR(rq);
                                        break;
                                }

                                i915_request_get(rq);
                                i915_request_add(rq);

                                if (!wait_until_running(&h, rq)) {
                                        struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

                                        pr_err("%s: Failed to start request %llx, at %x\n",
                                               __func__, rq->fence.seqno, hws_seqno(&h, rq));
                                        intel_engine_dump(engine, &p,
                                                          "%s\n", engine->name);

                                        i915_request_put(rq);
                                        err = -EIO;
                                        break;
                                }

                                i915_request_put(rq);
                        }

                        err = intel_engine_reset(engine, NULL);
                        if (err) {
                                pr_err("intel_engine_reset(%s) failed, err:%d\n",
                                       engine->name, err);
                                break;
                        }

                        if (i915_reset_count(global) != reset_count) {
                                pr_err("Full GPU reset recorded! (engine reset expected)\n");
                                err = -EINVAL;
                                break;
                        }

                        if (i915_reset_engine_count(global, engine) !=
                            ++reset_engine_count) {
                                pr_err("%s engine reset not recorded!\n",
                                       engine->name);
                                err = -EINVAL;
                                break;
                        }

                        count++;
                } while (time_before(jiffies, end_time));
                clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
                st_engine_heartbeat_enable(engine);
                pr_info("%s: Completed %lu %s resets\n",
                        engine->name, count, active ? "active" : "idle");

                if (err)
                        break;

                err = igt_flush_test(gt->i915);
                if (err)
                        break;
        }

        if (intel_gt_is_wedged(gt))
                err = -EIO;

        if (active)
                hang_fini(&h);

        return err;
}

static int igt_reset_idle_engine(void *arg)
{
        return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
        return __igt_reset_engine(arg, true);
}

struct active_engine {
        struct task_struct *task;
        struct intel_engine_cs *engine;
        unsigned long resets;
        unsigned int flags;
};

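/*
 * Background-load selectors for __igt_reset_engines():
 * TEST_ACTIVE   - hang the engine under test with a spinning batch,
 * TEST_OTHERS   - run request threads on the other engines,
 * TEST_SELF     - also run a request thread on the engine being reset,
 * TEST_PRIORITY - scatter random priorities over the background load.
 */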
#define TEST_ACTIVE     BIT(0)
#define TEST_OTHERS     BIT(1)
#define TEST_SELF       BIT(2)
#define TEST_PRIORITY   BIT(3)

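/*
 * Wait up to 5s for a background request to be retired; if it is not,
 * wedge the device so the test fails loudly instead of hanging.
 */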
static int active_request_put(struct i915_request *rq)
{
        int err = 0;

        if (!rq)
                return 0;

        if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
                GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
                          rq->engine->name,
                          rq->fence.context,
                          rq->fence.seqno);
                GEM_TRACE_DUMP();

                intel_gt_set_wedged(rq->engine->gt);
                err = -EIO;
        }

        i915_request_put(rq);

        return err;
}

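/*
 * kthread body for the background load: keep a ring of 8 requests in
 * flight across 8 contexts on one engine, optionally with randomised
 * priorities, until told to stop.
 */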
static int active_engine(void *data)
{
        I915_RND_STATE(prng);
        struct active_engine *arg = data;
        struct intel_engine_cs *engine = arg->engine;
        struct i915_request *rq[8] = {};
        struct intel_context *ce[ARRAY_SIZE(rq)];
        unsigned long count;
        int err = 0;

        for (count = 0; count < ARRAY_SIZE(ce); count++) {
                ce[count] = intel_context_create(engine);
                if (IS_ERR(ce[count])) {
                        err = PTR_ERR(ce[count]);
                        while (count--)
                                intel_context_put(ce[count]);
                        return err;
                }
        }

        count = 0;
        while (!kthread_should_stop()) {
                unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
                struct i915_request *old = rq[idx];
                struct i915_request *new;

                new = intel_context_create_request(ce[idx]);
                if (IS_ERR(new)) {
                        err = PTR_ERR(new);
                        break;
                }

                rq[idx] = i915_request_get(new);
                i915_request_add(new);

                if (engine->schedule && arg->flags & TEST_PRIORITY) {
                        struct i915_sched_attr attr = {
                                .priority =
                                        i915_prandom_u32_max_state(512, &prng),
                        };
                        engine->schedule(rq[idx], &attr);
                }

                err = active_request_put(old);
                if (err)
                        break;

                cond_resched();
        }

        for (count = 0; count < ARRAY_SIZE(rq); count++) {
                int err__ = active_request_put(rq[count]);

                /* Keep the first error */
                if (!err)
                        err = err__;

                intel_context_put(ce[count]);
        }

        return err;
}

static int __igt_reset_engines(struct intel_gt *gt,
                               const char *test_name,
                               unsigned int flags)
{
        struct i915_gpu_error *global = &gt->i915->gpu_error;
        struct intel_engine_cs *engine, *other;
        enum intel_engine_id id, tmp;
        struct hang h;
        int err = 0;

        /* Check that issuing a reset on one engine does not interfere
         * with any other engine.
         */

        if (!intel_has_reset_engine(gt))
                return 0;

        if (flags & TEST_ACTIVE) {
                err = hang_init(&h, gt);
                if (err)
                        return err;

                if (flags & TEST_PRIORITY)
                        h.ctx->sched.priority = 1024;
        }

        for_each_engine(engine, gt, id) {
                struct active_engine threads[I915_NUM_ENGINES] = {};
                unsigned long device = i915_reset_count(global);
                unsigned long count = 0, reported;
                IGT_TIMEOUT(end_time);

                if (flags & TEST_ACTIVE &&
                    !intel_engine_can_store_dword(engine))
                        continue;

                if (!wait_for_idle(engine)) {
                        pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
                               engine->name, test_name);
                        err = -EIO;
                        break;
                }

                memset(threads, 0, sizeof(threads));
                for_each_engine(other, gt, tmp) {
                        struct task_struct *tsk;

                        threads[tmp].resets =
                                i915_reset_engine_count(global, other);

                        if (other == engine && !(flags & TEST_SELF))
                                continue;

                        if (other != engine && !(flags & TEST_OTHERS))
                                continue;

                        threads[tmp].engine = other;
                        threads[tmp].flags = flags;

                        tsk = kthread_run(active_engine, &threads[tmp],
                                          "igt/%s", other->name);
                        if (IS_ERR(tsk)) {
                                err = PTR_ERR(tsk);
                                goto unwind;
                        }

                        threads[tmp].task = tsk;
                        get_task_struct(tsk);
                }

                yield(); /* start all threads before we begin */

                st_engine_heartbeat_disable(engine);
                set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
                do {
                        struct i915_request *rq = NULL;

                        if (flags & TEST_ACTIVE) {
                                rq = hang_create_request(&h, engine);
                                if (IS_ERR(rq)) {
                                        err = PTR_ERR(rq);
                                        break;
                                }

                                i915_request_get(rq);
                                i915_request_add(rq);

                                if (!wait_until_running(&h, rq)) {
                                        struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

                                        pr_err("%s: Failed to start request %llx, at %x\n",
                                               __func__, rq->fence.seqno, hws_seqno(&h, rq));
                                        intel_engine_dump(engine, &p,
                                                          "%s\n", engine->name);

                                        i915_request_put(rq);
                                        err = -EIO;
                                        break;
                                }
                        }

                        err = intel_engine_reset(engine, NULL);
                        if (err) {
                                pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
                                       engine->name, test_name, err);
                                break;
                        }

                        count++;

                        if (rq) {
                                if (rq->fence.error != -EIO) {
                                        pr_err("i915_reset_engine(%s:%s):"
                                               " failed to reset request %llx:%lld\n",
                                               engine->name, test_name,
                                               rq->fence.context,
                                               rq->fence.seqno);
                                        i915_request_put(rq);

                                        GEM_TRACE_DUMP();
                                        intel_gt_set_wedged(gt);
                                        err = -EIO;
                                        break;
                                }

                                if (i915_request_wait(rq, 0, HZ / 5) < 0) {
                                        struct drm_printer p =
                                                drm_info_printer(gt->i915->drm.dev);

                                        pr_err("i915_reset_engine(%s:%s):"
                                               " failed to complete request %llx:%lld after reset\n",
                                               engine->name, test_name,
                                               rq->fence.context,
                                               rq->fence.seqno);
                                        intel_engine_dump(engine, &p,
                                                          "%s\n", engine->name);
                                        i915_request_put(rq);

                                        GEM_TRACE_DUMP();
                                        intel_gt_set_wedged(gt);
                                        err = -EIO;
                                        break;
                                }

                                i915_request_put(rq);
                        }

                        if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
                                struct drm_printer p =
                                        drm_info_printer(gt->i915->drm.dev);

                                pr_err("i915_reset_engine(%s:%s):"
                                       " failed to idle after reset\n",
                                       engine->name, test_name);
                                intel_engine_dump(engine, &p,
                                                  "%s\n", engine->name);

                                err = -EIO;
                                break;
                        }
                } while (time_before(jiffies, end_time));
                clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
                st_engine_heartbeat_enable(engine);

                pr_info("i915_reset_engine(%s:%s): %lu resets\n",
                        engine->name, test_name, count);

                reported = i915_reset_engine_count(global, engine);
                reported -= threads[engine->id].resets;
                if (reported != count) {
                        pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
                               engine->name, test_name, count, reported);
                        if (!err)
                                err = -EINVAL;
                }

unwind:
                for_each_engine(other, gt, tmp) {
                        int ret;

                        if (!threads[tmp].task)
                                continue;

                        ret = kthread_stop(threads[tmp].task);
                        if (ret) {
                                pr_err("kthread for other engine %s failed, err=%d\n",
                                       other->name, ret);
                                if (!err)
                                        err = ret;
                        }
                        put_task_struct(threads[tmp].task);

                        if (other->uabi_class != engine->uabi_class &&
                            threads[tmp].resets !=
                            i915_reset_engine_count(global, other)) {
                                pr_err("Innocent engine %s was reset (count=%ld)\n",
                                       other->name,
                                       i915_reset_engine_count(global, other) -
                                       threads[tmp].resets);
                                if (!err)
                                        err = -EINVAL;
                        }
                }

                if (device != i915_reset_count(global)) {
                        pr_err("Global reset (count=%ld)!\n",
                               i915_reset_count(global) - device);
                        if (!err)
                                err = -EINVAL;
                }

                if (err)
                        break;

                err = igt_flush_test(gt->i915);
                if (err)
                        break;
        }

        if (intel_gt_is_wedged(gt))
                err = -EIO;

        if (flags & TEST_ACTIVE)
                hang_fini(&h);

        return err;
}

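/* Run the per-engine reset test over each combination of background load. */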
static int igt_reset_engines(void *arg)
{
        static const struct {
                const char *name;
                unsigned int flags;
        } phases[] = {
                { "idle", 0 },
                { "active", TEST_ACTIVE },
                { "others-idle", TEST_OTHERS },
                { "others-active", TEST_OTHERS | TEST_ACTIVE },
                {
                        "others-priority",
                        TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
                },
                {
                        "self-priority",
                        TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
                },
                { }
        };
        struct intel_gt *gt = arg;
        typeof(*phases) *p;
        int err;

        for (p = phases; p->name; p++) {
                if (p->flags & TEST_PRIORITY) {
                        if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
                                continue;
                }

                err = __igt_reset_engines(arg, p->name, p->flags);
                if (err)
                        return err;
        }

        return 0;
}

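/*
 * Stand in for the hangcheck worker: trigger the reset directly and
 * return the reset count beforehand so callers can verify it advanced.
 */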
static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
{
        u32 count = i915_reset_count(&gt->i915->gpu_error);

        intel_gt_reset(gt, mask, NULL);

        return count;
}

static int igt_reset_wait(void *arg)
{
        struct intel_gt *gt = arg;
        struct i915_gpu_error *global = &gt->i915->gpu_error;
        struct intel_engine_cs *engine = gt->engine[RCS0];
        struct i915_request *rq;
        unsigned int reset_count;
        struct hang h;
        long timeout;
        int err;

        if (!engine || !intel_engine_can_store_dword(engine))
                return 0;

        /* Check that we detect a stuck waiter and issue a reset */

        igt_global_reset_lock(gt);

        err = hang_init(&h, gt);
        if (err)
                goto unlock;

        rq = hang_create_request(&h, engine);
        if (IS_ERR(rq)) {
                err = PTR_ERR(rq);
                goto fini;
        }

        i915_request_get(rq);
        i915_request_add(rq);

        if (!wait_until_running(&h, rq)) {
                struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

                pr_err("%s: Failed to start request %llx, at %x\n",
                       __func__, rq->fence.seqno, hws_seqno(&h, rq));
                intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

                intel_gt_set_wedged(gt);

                err = -EIO;
                goto out_rq;
        }

        reset_count = fake_hangcheck(gt, ALL_ENGINES);

        timeout = i915_request_wait(rq, 0, 10);
        if (timeout < 0) {
                pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
                       timeout);
                err = timeout;
                goto out_rq;
        }

        if (i915_reset_count(global) == reset_count) {
                pr_err("No GPU reset recorded!\n");
                err = -EINVAL;
                goto out_rq;
        }

out_rq:
        i915_request_put(rq);
fini:
        hang_fini(&h);
unlock:
        igt_global_reset_unlock(gt);

        if (intel_gt_is_wedged(gt))
                return -EIO;

        return err;
}

struct evict_vma {
        struct completion completion;
        struct i915_vma *vma;
};

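/*
 * Thread bodies for __igt_reset_evict_vma(): each signals readiness and
 * then blocks on an operation (evicting the vma's node, or stealing its
 * fence register) that cannot make progress until the hanging request
 * holding the vma is reset.
 */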
1242static int evict_vma(void *data)
1243{
1244        struct evict_vma *arg = data;
1245        struct i915_address_space *vm = arg->vma->vm;
1246        struct drm_mm_node evict = arg->vma->node;
1247        int err;
1248
1249        complete(&arg->completion);
1250
1251        mutex_lock(&vm->mutex);
1252        err = i915_gem_evict_for_node(vm, &evict, 0);
1253        mutex_unlock(&vm->mutex);
1254
1255        return err;
1256}
1257
1258static int evict_fence(void *data)
1259{
1260        struct evict_vma *arg = data;
1261        int err;
1262
1263        complete(&arg->completion);
1264
1265        /* Mark the fence register as dirty to force the mmio update. */
1266        err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
1267        if (err) {
1268                pr_err("Invalid Y-tiling settings; err:%d\n", err);
1269                return err;
1270        }
1271
1272        err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
1273        if (err) {
1274                pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
1275                return err;
1276        }
1277
1278        err = i915_vma_pin_fence(arg->vma);
1279        i915_vma_unpin(arg->vma);
1280        if (err) {
1281                pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
1282                return err;
1283        }
1284
1285        i915_vma_unpin_fence(arg->vma);
1286
1287        return 0;
1288}
1289
1290static int __igt_reset_evict_vma(struct intel_gt *gt,
1291                                 struct i915_address_space *vm,
1292                                 int (*fn)(void *),
1293                                 unsigned int flags)
1294{
1295        struct intel_engine_cs *engine = gt->engine[RCS0];
1296        struct drm_i915_gem_object *obj;
1297        struct task_struct *tsk = NULL;
1298        struct i915_request *rq;
1299        struct evict_vma arg;
1300        struct hang h;
1301        unsigned int pin_flags;
1302        int err;
1303
1304        if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE)
1305                return 0;
1306
1307        if (!engine || !intel_engine_can_store_dword(engine))
1308                return 0;
1309
1310        /* Check that we can recover an unbind stuck on a hanging request */
1311
1312        err = hang_init(&h, gt);
1313        if (err)
1314                return err;
1315
1316        obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
1317        if (IS_ERR(obj)) {
1318                err = PTR_ERR(obj);
1319                goto fini;
1320        }
1321
1322        if (flags & EXEC_OBJECT_NEEDS_FENCE) {
1323                err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
1324                if (err) {
1325                        pr_err("Invalid X-tiling settings; err:%d\n", err);
1326                        goto out_obj;
1327                }
1328        }
1329
1330        arg.vma = i915_vma_instance(obj, vm, NULL);
1331        if (IS_ERR(arg.vma)) {
1332                err = PTR_ERR(arg.vma);
1333                goto out_obj;
1334        }
1335
1336        rq = hang_create_request(&h, engine);
1337        if (IS_ERR(rq)) {
1338                err = PTR_ERR(rq);
1339                goto out_obj;
1340        }
1341
1342        pin_flags = i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER;
1343
1344        if (flags & EXEC_OBJECT_NEEDS_FENCE)
1345                pin_flags |= PIN_MAPPABLE;
1346
1347        err = i915_vma_pin(arg.vma, 0, 0, pin_flags);
1348        if (err) {
1349                i915_request_add(rq);
1350                goto out_obj;
1351        }
1352
1353        if (flags & EXEC_OBJECT_NEEDS_FENCE) {
1354                err = i915_vma_pin_fence(arg.vma);
1355                if (err) {
1356                        pr_err("Unable to pin X-tiled fence; err:%d\n", err);
1357                        i915_vma_unpin(arg.vma);
1358                        i915_request_add(rq);
1359                        goto out_obj;
1360                }
1361        }
1362
1363        i915_vma_lock(arg.vma);
1364        err = i915_request_await_object(rq, arg.vma->obj,
1365                                        flags & EXEC_OBJECT_WRITE);
1366        if (err == 0)
1367                err = i915_vma_move_to_active(arg.vma, rq, flags);
1368        i915_vma_unlock(arg.vma);
1369
1370        if (flags & EXEC_OBJECT_NEEDS_FENCE)
1371                i915_vma_unpin_fence(arg.vma);
1372        i915_vma_unpin(arg.vma);
1373
1374        i915_request_get(rq);
1375        i915_request_add(rq);
1376        if (err)
1377                goto out_rq;
1378
1379        if (!wait_until_running(&h, rq)) {
1380                struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1381
1382                pr_err("%s: Failed to start request %llx, at %x\n",
1383                       __func__, rq->fence.seqno, hws_seqno(&h, rq));
1384                intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1385
1386                intel_gt_set_wedged(gt);
1387                goto out_reset;
1388        }
1389
1390        init_completion(&arg.completion);
1391
1392        tsk = kthread_run(fn, &arg, "igt/evict_vma");
1393        if (IS_ERR(tsk)) {
1394                err = PTR_ERR(tsk);
1395                tsk = NULL;
1396                goto out_reset;
1397        }
1398        get_task_struct(tsk);
1399
1400        wait_for_completion(&arg.completion);
1401
1402        if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
1403                struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1404
1405                pr_err("igt/evict_vma kthread did not wait\n");
1406                intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1407
1408                intel_gt_set_wedged(gt);
1409                goto out_reset;
1410        }
1411
1412out_reset:
1413        igt_global_reset_lock(gt);
1414        fake_hangcheck(gt, rq->engine->mask);
1415        igt_global_reset_unlock(gt);
1416
1417        if (tsk) {
1418                struct intel_wedge_me w;
1419
1420                /* The reset, even indirectly, should take less than 10ms. */
1421                intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
1422                        err = kthread_stop(tsk);
1423
1424                put_task_struct(tsk);
1425        }
1426
1427out_rq:
1428        i915_request_put(rq);
1429out_obj:
1430        i915_gem_object_put(obj);
1431fini:
1432        hang_fini(&h);
1433        if (intel_gt_is_wedged(gt))
1434                return -EIO;
1435
1436        return err;
1437}
1438
1439static int igt_reset_evict_ggtt(void *arg)
1440{
1441        struct intel_gt *gt = arg;
1442
1443        return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
1444                                     evict_vma, EXEC_OBJECT_WRITE);
1445}
1446
1447static int igt_reset_evict_ppgtt(void *arg)
1448{
1449        struct intel_gt *gt = arg;
1450        struct i915_ppgtt *ppgtt;
1451        int err;
1452
1453        /* aliasing == global gtt locking, covered above */
1454        if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL)
1455                return 0;
1456
1457        ppgtt = i915_ppgtt_create(gt);
1458        if (IS_ERR(ppgtt))
1459                return PTR_ERR(ppgtt);
1460
1461        err = __igt_reset_evict_vma(gt, &ppgtt->vm,
1462                                    evict_vma, EXEC_OBJECT_WRITE);
1463        i915_vm_put(&ppgtt->vm);
1464
1465        return err;
1466}
1467
1468static int igt_reset_evict_fence(void *arg)
1469{
1470        struct intel_gt *gt = arg;
1471
1472        return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
1473                                     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
1474}
1475
1476static int wait_for_others(struct intel_gt *gt,
1477                           struct intel_engine_cs *exclude)
1478{
1479        struct intel_engine_cs *engine;
1480        enum intel_engine_id id;
1481
1482        for_each_engine(engine, gt, id) {
1483                if (engine == exclude)
1484                        continue;
1485
1486                if (!wait_for_idle(engine))
1487                        return -EIO;
1488        }
1489
1490        return 0;
1491}
1492
static int igt_reset_queue(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	igt_global_reset_lock(gt);

	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	for_each_engine(engine, gt, id) {
		struct i915_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;

		if (!intel_engine_can_store_dword(engine))
			continue;

		prev = hang_create_request(&h, engine);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			goto fini;
		}

		i915_request_get(prev);
		i915_request_add(prev);

		count = 0;
		do {
			struct i915_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h, engine);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto fini;
			}

			i915_request_get(rq);
			i915_request_add(rq);

			/*
			 * XXX We don't handle resetting the kernel context
			 * very well. If we trigger a device reset twice in
			 * quick succession while the kernel context is
			 * executing, we may end up skipping the breadcrumb.
			 * This is really only a problem for the selftest as
			 * normally there is a large interlude between resets
			 * (hangcheck), or we focus on resetting just one
			 * engine and so avoid repeatedly resetting innocents.
			 */
			err = wait_for_others(gt, engine);
			if (err) {
				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
				       __func__, engine->name);
				i915_request_put(rq);
				i915_request_put(prev);

				GEM_TRACE_DUMP();
				intel_gt_set_wedged(gt);
				goto fini;
			}

			if (!wait_until_running(&h, prev)) {
				struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

				pr_err("%s(%s): Failed to start request %llx, at %x\n",
				       __func__, engine->name,
				       prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				i915_request_put(rq);
				i915_request_put(prev);

				intel_gt_set_wedged(gt);

				err = -EIO;
				goto fini;
			}

			reset_count = fake_hangcheck(gt, BIT(id));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (i915_reset_count(global) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			i915_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d queued resets\n",
			engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_put(prev);

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

fini:
	hang_fini(&h);
unlock:
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

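/*
 * Feed a hang to intel_gt_handle_error() directly, with error capture
 * temporarily suppressed, and check that the hanging request is marked
 * as the guilty party.
 */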
static int igt_handle_error(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct hang h;
	struct i915_request *rq;
	struct i915_gpu_coredump *error;
	int err;

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	err = hang_init(&h, gt);
	if (err)
		return err;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto err_fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto err_request;
	}

	/*
	 * Temporarily disable error capture: parking a non-NULL sentinel
	 * in first_error means that any coredump generated by the reset
	 * below is discarded instead of being stored as the first error.
	 */
	error = xchg(&global->first_error, (void *)-1);

	intel_gt_handle_error(gt, engine->mask, 0, NULL);

	xchg(&global->first_error, error);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_request_put(rq);
err_fini:
	hang_fini(&h);
	return err;
}

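/*
 * Reset the engine from inside the atomic section described by @p,
 * with the submission tasklet disabled and, except during the softirq
 * phase itself, bottom halves disabled as well.
 */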
static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
				     const struct igt_atomic_section *p,
				     const char *mode)
{
	struct tasklet_struct * const t = &engine->execlists.tasklet;
	int err;

	GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
		  engine->name, mode, p->name);

	if (t->func)
		tasklet_disable(t);
	if (strcmp(p->name, "softirq"))
		local_bh_disable();
	p->critical_section_begin();

	err = __intel_engine_reset_bh(engine, NULL);

	p->critical_section_end();
	if (strcmp(p->name, "softirq"))
		local_bh_enable();
	if (t->func) {
		tasklet_enable(t);
		tasklet_hi_schedule(t);
	}

	if (err)
		pr_err("i915_reset_engine(%s:%s) failed under %s\n",
		       engine->name, mode, p->name);

	return err;
}

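/*
 * Exercise the atomic engine reset twice: once while the engine is
 * idle, then again underneath a spinning request, finally checking
 * that the request is cleaned up promptly after the reset.
 */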
static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
				   const struct igt_atomic_section *p)
{
	struct i915_request *rq;
	struct hang h;
	int err;

	err = __igt_atomic_reset_engine(engine, p, "idle");
	if (err)
		return err;

	err = hang_init(&h, engine->gt);
	if (err)
		return err;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (wait_until_running(&h, rq)) {
		err = __igt_atomic_reset_engine(engine, p, "active");
	} else {
		pr_err("%s(%s): Failed to start request %llx, at %x\n",
		       __func__, engine->name,
		       rq->fence.seqno, hws_seqno(&h, rq));
		intel_gt_set_wedged(engine->gt);
		err = -EIO;
	}

	if (err == 0) {
		struct intel_wedge_me w;

		intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
			i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(engine->gt))
			err = -EIO;
	}

	i915_request_put(rq);
out:
	hang_fini(&h);
	return err;
}

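/*
 * Walk every engine through each phase in igt_atomic_phases. Skipped
 * when the GuC owns submission, as engine resets are then mediated by
 * the GuC rather than issued directly by the host.
 */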
static int igt_reset_engines_atomic(void *arg)
{
	struct intel_gt *gt = arg;
	const typeof(*igt_atomic_phases) *p;
	int err = 0;

	/* Check that the engine resets are usable from atomic context */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (intel_uc_uses_guc_submission(&gt->uc))
		return 0;

	igt_global_reset_lock(gt);

	/* Flush any requests before we get started and check basics */
	if (!igt_force_reset(gt))
		goto unlock;

	for (p = igt_atomic_phases; p->name; p++) {
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

		for_each_engine(engine, gt, id) {
			err = igt_atomic_reset_engine(engine, p);
			if (err)
				goto out;
		}
	}

out:
	/* As we poke around the guts, do a full reset before continuing. */
	igt_force_reset(gt);
unlock:
	igt_global_reset_unlock(gt);

	return err;
}

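/*
 * Entry point for the live hangcheck selftests. Requires GPU reset
 * support, refuses to run on an already wedged device, and holds a
 * runtime-pm wakeref for the duration of the subtests.
 */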
int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_nop),
		SUBTEST(igt_reset_nop_engine),
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
		SUBTEST(igt_reset_fail_engine),
		SUBTEST(igt_reset_engines),
		SUBTEST(igt_reset_engines_atomic),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_reset_wait),
		SUBTEST(igt_reset_evict_ggtt),
		SUBTEST(igt_reset_evict_ppgtt),
		SUBTEST(igt_reset_evict_fence),
		SUBTEST(igt_handle_error),
	};
	struct intel_gt *gt = &i915->gt;
	intel_wakeref_t wakeref;
	int err;

	if (!intel_has_gpu_reset(gt))
		return 0;

	if (intel_gt_is_wedged(gt))
		return -EIO; /* we're long past hope of a successful reset */

	wakeref = intel_runtime_pm_get(gt->uncore->rpm);

	err = intel_gt_live_subtests(tests, gt);

	intel_runtime_pm_put(gt->uncore->rpm, wakeref);

	return err;
}