linux/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
// SPDX-License-Identifier: MIT
/*
 * Copyright © 2016 Intel Corporation
 */

#include <linux/kthread.h>

#include "gem/i915_gem_context.h"

#include "intel_gt.h"
#include "intel_engine_heartbeat.h"
#include "intel_engine_pm.h"
#include "selftest_engine_heartbeat.h"

#include "i915_selftest.h"
#include "selftests/i915_random.h"
#include "selftests/igt_flush_test.h"
#include "selftests/igt_reset.h"
#include "selftests/igt_atomic.h"
#include "selftests/igt_spinner.h"
#include "selftests/intel_scheduler_helpers.h"

#include "selftests/mock_drm.h"

#include "gem/selftests/mock_context.h"
#include "gem/selftests/igt_gem_utils.h"

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */

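/*
 * Scaffolding shared by the hang tests: a kernel context, a batch object
 * whose payload spins forever, and a HWS page into which each spinning
 * request writes its seqno so we can tell when it is actually running.
 */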
struct hang {
        struct intel_gt *gt;
        struct drm_i915_gem_object *hws;
        struct drm_i915_gem_object *obj;
        struct i915_gem_context *ctx;
        u32 *seqno;
        u32 *batch;
};

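/*
 * Create the kernel context and the two backing pages: the HWS page is
 * mapped write-back for CPU polling, the batch page with the platform's
 * coherent mapping so our stores are visible to the GPU.
 */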
static int hang_init(struct hang *h, struct intel_gt *gt)
{
        void *vaddr;
        int err;

        memset(h, 0, sizeof(*h));
        h->gt = gt;

        h->ctx = kernel_context(gt->i915, NULL);
        if (IS_ERR(h->ctx))
                return PTR_ERR(h->ctx);

        GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));

        h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
        if (IS_ERR(h->hws)) {
                err = PTR_ERR(h->hws);
                goto err_ctx;
        }

        h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
        if (IS_ERR(h->obj)) {
                err = PTR_ERR(h->obj);
                goto err_hws;
        }

        i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
        vaddr = i915_gem_object_pin_map_unlocked(h->hws, I915_MAP_WB);
        if (IS_ERR(vaddr)) {
                err = PTR_ERR(vaddr);
                goto err_obj;
        }
        h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

        vaddr = i915_gem_object_pin_map_unlocked(h->obj,
                                                 i915_coherent_map_type(gt->i915, h->obj, false));
        if (IS_ERR(vaddr)) {
                err = PTR_ERR(vaddr);
                goto err_unpin_hws;
        }
        h->batch = vaddr;

        return 0;

err_unpin_hws:
        i915_gem_object_unpin_map(h->hws);
err_obj:
        i915_gem_object_put(h->obj);
err_hws:
        i915_gem_object_put(h->hws);
err_ctx:
        kernel_context_close(h->ctx);
        return err;
}

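/*
 * Each request is assigned a slot in the HWS page keyed by its fence
 * context (wrapping within the page), so concurrent spinners on different
 * contexts do not clobber each other's seqno writes.
 */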
static u64 hws_address(const struct i915_vma *hws,
                       const struct i915_request *rq)
{
        return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}

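/*
 * Under the vma lock, add the implicit dependency on the object and then
 * mark the vma as active for this request.
 */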
static int move_to_active(struct i915_vma *vma,
                          struct i915_request *rq,
                          unsigned int flags)
{
        int err;

        i915_vma_lock(vma);
        err = i915_request_await_object(rq, vma->obj,
                                        flags & EXEC_OBJECT_WRITE);
        if (err == 0)
                err = i915_vma_move_to_active(vma, rq, flags);
        i915_vma_unlock(vma);

        return err;
}

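/*
 * Build a request whose batch writes the request's seqno to the HWS and
 * then branches back to its own start, spinning forever. The terminating
 * MI_BATCH_BUFFER_END is placed after the loop, so the hang can later be
 * released by overwriting the batch's first dword with MI_BATCH_BUFFER_END
 * (see hang_fini). A fresh batch object is allocated per request so that
 * it can be bound into the context's address space.
 */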
static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
        struct intel_gt *gt = h->gt;
        struct i915_address_space *vm = i915_gem_context_get_vm_rcu(h->ctx);
        struct drm_i915_gem_object *obj;
        struct i915_request *rq = NULL;
        struct i915_vma *hws, *vma;
        unsigned int flags;
        void *vaddr;
        u32 *batch;
        int err;

        obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
        if (IS_ERR(obj)) {
                i915_vm_put(vm);
                return ERR_CAST(obj);
        }

        vaddr = i915_gem_object_pin_map_unlocked(obj, i915_coherent_map_type(gt->i915, obj, false));
        if (IS_ERR(vaddr)) {
                i915_gem_object_put(obj);
                i915_vm_put(vm);
                return ERR_CAST(vaddr);
        }

        i915_gem_object_unpin_map(h->obj);
        i915_gem_object_put(h->obj);

        h->obj = obj;
        h->batch = vaddr;

        vma = i915_vma_instance(h->obj, vm, NULL);
        if (IS_ERR(vma)) {
                i915_vm_put(vm);
                return ERR_CAST(vma);
        }

        hws = i915_vma_instance(h->hws, vm, NULL);
        if (IS_ERR(hws)) {
                i915_vm_put(vm);
                return ERR_CAST(hws);
        }

        err = i915_vma_pin(vma, 0, 0, PIN_USER);
        if (err) {
                i915_vm_put(vm);
                return ERR_PTR(err);
        }

        err = i915_vma_pin(hws, 0, 0, PIN_USER);
        if (err)
                goto unpin_vma;

        rq = igt_request_alloc(h->ctx, engine);
        if (IS_ERR(rq)) {
                err = PTR_ERR(rq);
                goto unpin_hws;
        }

        err = move_to_active(vma, rq, 0);
        if (err)
                goto cancel_rq;

        err = move_to_active(hws, rq, 0);
        if (err)
                goto cancel_rq;

        batch = h->batch;
        if (GRAPHICS_VER(gt->i915) >= 8) {
                *batch++ = MI_STORE_DWORD_IMM_GEN4;
                *batch++ = lower_32_bits(hws_address(hws, rq));
                *batch++ = upper_32_bits(hws_address(hws, rq));
                *batch++ = rq->fence.seqno;
                *batch++ = MI_NOOP;

                memset(batch, 0, 1024);
                batch += 1024 / sizeof(*batch);

                *batch++ = MI_NOOP;
                *batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
                *batch++ = lower_32_bits(vma->node.start);
                *batch++ = upper_32_bits(vma->node.start);
        } else if (GRAPHICS_VER(gt->i915) >= 6) {
                *batch++ = MI_STORE_DWORD_IMM_GEN4;
                *batch++ = 0;
                *batch++ = lower_32_bits(hws_address(hws, rq));
                *batch++ = rq->fence.seqno;
                *batch++ = MI_NOOP;

                memset(batch, 0, 1024);
                batch += 1024 / sizeof(*batch);

                *batch++ = MI_NOOP;
                *batch++ = MI_BATCH_BUFFER_START | 1 << 8;
                *batch++ = lower_32_bits(vma->node.start);
        } else if (GRAPHICS_VER(gt->i915) >= 4) {
                *batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
                *batch++ = 0;
                *batch++ = lower_32_bits(hws_address(hws, rq));
                *batch++ = rq->fence.seqno;
                *batch++ = MI_NOOP;

                memset(batch, 0, 1024);
                batch += 1024 / sizeof(*batch);

                *batch++ = MI_NOOP;
                *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
                *batch++ = lower_32_bits(vma->node.start);
        } else {
                *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
                *batch++ = lower_32_bits(hws_address(hws, rq));
                *batch++ = rq->fence.seqno;
                *batch++ = MI_NOOP;

                memset(batch, 0, 1024);
                batch += 1024 / sizeof(*batch);

                *batch++ = MI_NOOP;
                *batch++ = MI_BATCH_BUFFER_START | 2 << 6;
                *batch++ = lower_32_bits(vma->node.start);
        }
        *batch++ = MI_BATCH_BUFFER_END; /* not reached */
        intel_gt_chipset_flush(engine->gt);

        if (rq->engine->emit_init_breadcrumb) {
                err = rq->engine->emit_init_breadcrumb(rq);
                if (err)
                        goto cancel_rq;
        }

        flags = 0;
        if (GRAPHICS_VER(gt->i915) <= 5)
                flags |= I915_DISPATCH_SECURE;

        err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

cancel_rq:
        if (err) {
                i915_request_set_error_once(rq, err);
                i915_request_add(rq);
        }
unpin_hws:
        i915_vma_unpin(hws);
unpin_vma:
        i915_vma_unpin(vma);
        i915_vm_put(vm);
        return err ? ERR_PTR(err) : rq;
}

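/* Read back the seqno last written to this request's HWS slot. */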
static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
        return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

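/*
 * Release any still-spinning batch by overwriting its first dword with
 * MI_BATCH_BUFFER_END, then drop the scaffolding and flush residual work.
 */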
static void hang_fini(struct hang *h)
{
        *h->batch = MI_BATCH_BUFFER_END;
        intel_gt_chipset_flush(h->gt);

        i915_gem_object_unpin_map(h->obj);
        i915_gem_object_put(h->obj);

        i915_gem_object_unpin_map(h->hws);
        i915_gem_object_put(h->hws);

        kernel_context_close(h->ctx);

        igt_flush_test(h->gt->i915);
}

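/*
 * Poll (10us busy-wait, then up to 1s sleeping) until the spinner has
 * written its seqno to the HWS, i.e. until it is executing on the GPU.
 */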
static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
        return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
                                               rq->fence.seqno),
                             10) &&
                 wait_for(i915_seqno_passed(hws_seqno(h, rq),
                                            rq->fence.seqno),
                          1000));
}

static int igt_hang_sanitycheck(void *arg)
{
        struct intel_gt *gt = arg;
        struct i915_request *rq;
        struct intel_engine_cs *engine;
        enum intel_engine_id id;
        struct hang h;
        int err;

        /* Basic check that we can execute our hanging batch */

        err = hang_init(&h, gt);
        if (err)
                return err;

        for_each_engine(engine, gt, id) {
                struct intel_wedge_me w;
                long timeout;

                if (!intel_engine_can_store_dword(engine))
                        continue;

                rq = hang_create_request(&h, engine);
                if (IS_ERR(rq)) {
                        err = PTR_ERR(rq);
                        pr_err("Failed to create request for %s, err=%d\n",
                               engine->name, err);
                        goto fini;
                }

                i915_request_get(rq);

                *h.batch = MI_BATCH_BUFFER_END;
                intel_gt_chipset_flush(engine->gt);

                i915_request_add(rq);

                timeout = 0;
                intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
                        timeout = i915_request_wait(rq, 0,
                                                    MAX_SCHEDULE_TIMEOUT);
                if (intel_gt_is_wedged(gt))
                        timeout = -EIO;

                i915_request_put(rq);

                if (timeout < 0) {
                        err = timeout;
                        pr_err("Wait for request failed on %s, err=%d\n",
                               engine->name, err);
                        goto fini;
                }
        }

fini:
        hang_fini(&h);
        return err;
}

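/* Wait up to IGT_IDLE_TIMEOUT (50ms) for the engine to retire all work. */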
static bool wait_for_idle(struct intel_engine_cs *engine)
{
        return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}

static int igt_reset_nop(void *arg)
{
        struct intel_gt *gt = arg;
        struct i915_gpu_error *global = &gt->i915->gpu_error;
        struct intel_engine_cs *engine;
        unsigned int reset_count, count;
        enum intel_engine_id id;
        IGT_TIMEOUT(end_time);
        int err = 0;

        /* Check that we can reset during non-user portions of requests */

        reset_count = i915_reset_count(global);
        count = 0;
        do {
                for_each_engine(engine, gt, id) {
                        struct intel_context *ce;
                        int i;

                        ce = intel_context_create(engine);
                        if (IS_ERR(ce)) {
                                err = PTR_ERR(ce);
                                pr_err("[%s] Create context failed: %d!\n", engine->name, err);
                                break;
                        }

                        for (i = 0; i < 16; i++) {
                                struct i915_request *rq;

                                rq = intel_context_create_request(ce);
                                if (IS_ERR(rq)) {
                                        err = PTR_ERR(rq);
                                        pr_err("[%s] Create request failed: %d!\n",
                                               engine->name, err);
                                        break;
                                }

                                i915_request_add(rq);
                        }

                        intel_context_put(ce);
                }

                igt_global_reset_lock(gt);
                intel_gt_reset(gt, ALL_ENGINES, NULL);
                igt_global_reset_unlock(gt);

                /* 'engine' is stale outside the for_each_engine() loop */
                if (intel_gt_is_wedged(gt)) {
                        pr_err("GT is wedged!\n");
                        err = -EIO;
                        break;
                }

                if (i915_reset_count(global) != reset_count + ++count) {
                        pr_err("Reset not recorded: %d vs %d + %d!\n",
                               i915_reset_count(global), reset_count, count);
                        err = -EINVAL;
                        break;
                }

                err = igt_flush_test(gt->i915);
                if (err) {
                        pr_err("Flush failed: %d!\n", err);
                        break;
                }
        } while (time_before(jiffies, end_time));
        pr_info("%s: %d resets\n", __func__, count);

        if (igt_flush_test(gt->i915)) {
                pr_err("Post flush failed!\n");
                err = -EIO;
        }

        return err;
}

static int igt_reset_nop_engine(void *arg)
{
        struct intel_gt *gt = arg;
        struct i915_gpu_error *global = &gt->i915->gpu_error;
        struct intel_engine_cs *engine;
        enum intel_engine_id id;

        /* Check that we can engine-reset during non-user portions */

        if (!intel_has_reset_engine(gt))
                return 0;

        for_each_engine(engine, gt, id) {
                unsigned int reset_count, reset_engine_count, count;
                struct intel_context *ce;
                IGT_TIMEOUT(end_time);
                int err;

                if (intel_engine_uses_guc(engine)) {
                        /* Engine level resets are triggered by GuC when a hang
                         * is detected. They can't be triggered by the KMD any
                         * more. Thus a nop batch cannot be used as a reset test
                         */
                        continue;
                }

                ce = intel_context_create(engine);
                if (IS_ERR(ce)) {
                        pr_err("[%s] Create context failed: %pe!\n", engine->name, ce);
                        return PTR_ERR(ce);
                }

                reset_count = i915_reset_count(global);
                reset_engine_count = i915_reset_engine_count(global, engine);
                count = 0;

                st_engine_heartbeat_disable(engine);
                set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
                do {
                        int i;

                        if (!wait_for_idle(engine)) {
                                pr_err("%s failed to idle before reset\n",
                                       engine->name);
                                err = -EIO;
                                break;
                        }

                        for (i = 0; i < 16; i++) {
                                struct i915_request *rq;

                                rq = intel_context_create_request(ce);
                                if (IS_ERR(rq)) {
                                        struct drm_printer p =
                                                drm_info_printer(gt->i915->drm.dev);
                                        intel_engine_dump(engine, &p,
                                                          "%s(%s): failed to submit request\n",
                                                          __func__,
                                                          engine->name);

                                        GEM_TRACE("%s(%s): failed to submit request\n",
                                                  __func__,
                                                  engine->name);
                                        GEM_TRACE_DUMP();

                                        intel_gt_set_wedged(gt);

                                        err = PTR_ERR(rq);
                                        break;
                                }

                                i915_request_add(rq);
                        }
                        err = intel_engine_reset(engine, NULL);
                        if (err) {
                                pr_err("intel_engine_reset(%s) failed, err:%d\n",
                                       engine->name, err);
                                break;
                        }

                        if (i915_reset_count(global) != reset_count) {
                                pr_err("Full GPU reset recorded! (engine reset expected)\n");
                                err = -EINVAL;
                                break;
                        }

                        if (i915_reset_engine_count(global, engine) !=
                            reset_engine_count + ++count) {
                                pr_err("%s engine reset not recorded!\n",
                                       engine->name);
                                err = -EINVAL;
                                break;
                        }
                } while (time_before(jiffies, end_time));
                clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
                st_engine_heartbeat_enable(engine);

                pr_info("%s(%s): %d resets\n", __func__, engine->name, count);

                intel_context_put(ce);
                if (igt_flush_test(gt->i915))
                        err = -EIO;
                if (err)
                        return err;
        }

        return 0;
}

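/*
 * Error injection: bias the engine's reset_timeout so that subsequent
 * engine resets are forced to fail with -ETIMEDOUT, letting us exercise
 * the recovery-from-failed-reset paths (timeouts are only generated on
 * gen8+, see igt_reset_fail_engine below).
 */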
static void force_reset_timeout(struct intel_engine_cs *engine)
{
        engine->reset_timeout.probability = 999;
        atomic_set(&engine->reset_timeout.times, -1);
}

static void cancel_reset_timeout(struct intel_engine_cs *engine)
{
        memset(&engine->reset_timeout, 0, sizeof(engine->reset_timeout));
}

static int igt_reset_fail_engine(void *arg)
{
        struct intel_gt *gt = arg;
        struct intel_engine_cs *engine;
        enum intel_engine_id id;
        /* Check that we can recover from engine-reset failures */

        if (!intel_has_reset_engine(gt))
                return 0;

        for_each_engine(engine, gt, id) {
                unsigned int count;
                struct intel_context *ce;
                IGT_TIMEOUT(end_time);
                int err;

                /* Can't manually break the reset if i915 doesn't perform it */
                if (intel_engine_uses_guc(engine))
                        continue;

                ce = intel_context_create(engine);
                if (IS_ERR(ce)) {
                        pr_err("[%s] Create context failed: %pe!\n", engine->name, ce);
                        return PTR_ERR(ce);
                }

                st_engine_heartbeat_disable(engine);
                set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);

                force_reset_timeout(engine);
                err = intel_engine_reset(engine, NULL);
                cancel_reset_timeout(engine);
                if (err == 0) /* timeouts only generated on gen8+ */
                        goto skip;

                count = 0;
                do {
                        struct i915_request *last = NULL;
                        int i;

                        if (!wait_for_idle(engine)) {
                                pr_err("%s failed to idle before reset\n",
                                       engine->name);
                                err = -EIO;
                                break;
                        }

                        for (i = 0; i < count % 15; i++) {
                                struct i915_request *rq;

                                rq = intel_context_create_request(ce);
                                if (IS_ERR(rq)) {
                                        struct drm_printer p =
                                                drm_info_printer(gt->i915->drm.dev);
                                        intel_engine_dump(engine, &p,
                                                          "%s(%s): failed to submit request\n",
                                                          __func__,
                                                          engine->name);

                                        GEM_TRACE("%s(%s): failed to submit request\n",
                                                  __func__,
                                                  engine->name);
                                        GEM_TRACE_DUMP();

                                        intel_gt_set_wedged(gt);
                                        if (last)
                                                i915_request_put(last);

                                        err = PTR_ERR(rq);
                                        goto out;
                                }

                                if (last)
                                        i915_request_put(last);
                                last = i915_request_get(rq);
                                i915_request_add(rq);
                        }

                        if (count & 1) {
                                err = intel_engine_reset(engine, NULL);
                                if (err) {
                                        GEM_TRACE_ERR("intel_engine_reset(%s) failed, err:%d\n",
                                                      engine->name, err);
                                        GEM_TRACE_DUMP();
                                        i915_request_put(last);
                                        break;
                                }
                        } else {
                                force_reset_timeout(engine);
                                err = intel_engine_reset(engine, NULL);
                                cancel_reset_timeout(engine);
                                if (err != -ETIMEDOUT) {
                                        pr_err("intel_engine_reset(%s) did not fail, err:%d\n",
                                               engine->name, err);
                                        i915_request_put(last);
                                        break;
                                }
                        }

                        err = 0;
                        if (last) {
                                if (i915_request_wait(last, 0, HZ / 2) < 0) {
                                        struct drm_printer p =
                                                drm_info_printer(gt->i915->drm.dev);

                                        intel_engine_dump(engine, &p,
                                                          "%s(%s): failed to complete request\n",
                                                          __func__,
                                                          engine->name);

                                        GEM_TRACE("%s(%s): failed to complete request\n",
                                                  __func__,
                                                  engine->name);
                                        GEM_TRACE_DUMP();

                                        err = -EIO;
                                }
                                i915_request_put(last);
                        }
                        count++;
                } while (err == 0 && time_before(jiffies, end_time));
out:
                pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
skip:
                clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
                st_engine_heartbeat_enable(engine);
                intel_context_put(ce);

                if (igt_flush_test(gt->i915))
                        err = -EIO;
                if (err)
                        return err;
        }

        return 0;
}

static int __igt_reset_engine(struct intel_gt *gt, bool active)
{
        struct i915_gpu_error *global = &gt->i915->gpu_error;
        struct intel_engine_cs *engine;
        enum intel_engine_id id;
        struct hang h;
        int err = 0;

        /* Check that we can issue an engine reset on an idle or active engine */

        if (!intel_has_reset_engine(gt))
                return 0;

        if (active) {
                err = hang_init(&h, gt);
                if (err)
                        return err;
        }

        for_each_engine(engine, gt, id) {
                unsigned int reset_count, reset_engine_count;
                unsigned long count;
                bool using_guc = intel_engine_uses_guc(engine);
                IGT_TIMEOUT(end_time);

                if (using_guc && !active)
                        continue;

                if (active && !intel_engine_can_store_dword(engine))
                        continue;

                if (!wait_for_idle(engine)) {
                        pr_err("%s failed to idle before reset\n",
                               engine->name);
                        err = -EIO;
                        break;
                }

                reset_count = i915_reset_count(global);
                reset_engine_count = i915_reset_engine_count(global, engine);

                st_engine_heartbeat_disable(engine);
                set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
                count = 0;
                do {
                        struct i915_request *rq = NULL;
                        struct intel_selftest_saved_policy saved;
                        int err2;

                        err = intel_selftest_modify_policy(engine, &saved,
                                                           SELFTEST_SCHEDULER_MODIFY_FAST_RESET);
                        if (err) {
                                pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
                                break;
                        }

                        if (active) {
                                rq = hang_create_request(&h, engine);
                                if (IS_ERR(rq)) {
                                        err = PTR_ERR(rq);
                                        pr_err("[%s] Create hang request failed: %d!\n",
                                               engine->name, err);
                                        goto restore;
                                }

                                i915_request_get(rq);
                                i915_request_add(rq);

                                if (!wait_until_running(&h, rq)) {
                                        struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

                                        pr_err("%s: Failed to start request %llx, at %x\n",
                                               __func__, rq->fence.seqno, hws_seqno(&h, rq));
                                        intel_engine_dump(engine, &p,
                                                          "%s\n", engine->name);

                                        i915_request_put(rq);
                                        err = -EIO;
                                        goto restore;
                                }
                        }

                        if (!using_guc) {
                                err = intel_engine_reset(engine, NULL);
                                if (err) {
                                        pr_err("intel_engine_reset(%s) failed, err:%d\n",
                                               engine->name, err);
                                        goto skip;
                                }
                        }

                        if (rq) {
                                /* Ensure the reset happens and kills the engine */
                                err = intel_selftest_wait_for_rq(rq);
                                if (err)
                                        pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n",
                                               engine->name, rq->fence.context,
                                               rq->fence.seqno, rq->context->guc_id, err);
                        }

skip:
                        if (rq)
                                i915_request_put(rq);

                        if (i915_reset_count(global) != reset_count) {
                                pr_err("Full GPU reset recorded! (engine reset expected)\n");
                                err = -EINVAL;
                                goto restore;
                        }

                        /* GuC based resets are not logged per engine */
                        if (!using_guc) {
                                if (i915_reset_engine_count(global, engine) !=
                                    ++reset_engine_count) {
                                        pr_err("%s engine reset not recorded!\n",
                                               engine->name);
                                        err = -EINVAL;
                                        goto restore;
                                }
                        }

                        count++;

restore:
                        err2 = intel_selftest_restore_policy(engine, &saved);
                        if (err2)
                                pr_err("[%s] Restore policy failed: %d!\n", engine->name, err2);
                        if (err == 0)
                                err = err2;
                        if (err)
                                break;
                } while (time_before(jiffies, end_time));
                clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
                st_engine_heartbeat_enable(engine);
                pr_info("%s: Completed %lu %s resets\n",
                        engine->name, count, active ? "active" : "idle");

                if (err)
                        break;

                err = igt_flush_test(gt->i915);
                if (err) {
                        pr_err("[%s] Flush failed: %d!\n", engine->name, err);
                        break;
                }
        }

        if (intel_gt_is_wedged(gt)) {
                pr_err("GT is wedged!\n");
                err = -EIO;
        }

        if (active)
                hang_fini(&h);

        return err;
}

static int igt_reset_idle_engine(void *arg)
{
        return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
        return __igt_reset_engine(arg, true);
}

struct active_engine {
        struct task_struct *task;
        struct intel_engine_cs *engine;
        unsigned long resets;
        unsigned int flags;
};

#define TEST_ACTIVE     BIT(0)
#define TEST_OTHERS     BIT(1)
#define TEST_SELF       BIT(2)
#define TEST_PRIORITY   BIT(3)

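/*
 * Wait up to 10s for a background request to complete; if it does not,
 * dump the trace and wedge the GT so the test fails loudly rather than
 * hanging.
 */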
static int active_request_put(struct i915_request *rq)
{
        int err = 0;

        if (!rq)
                return 0;

        if (i915_request_wait(rq, 0, 10 * HZ) < 0) {
                GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
                          rq->engine->name,
                          rq->fence.context,
                          rq->fence.seqno);
                GEM_TRACE_DUMP();

                intel_gt_set_wedged(rq->engine->gt);
                err = -EIO;
        }

        i915_request_put(rq);

        return err;
}

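/*
 * kthread body used by __igt_reset_engines(): keep the engine saturated
 * with a rolling window of 8 requests on 8 contexts (optionally with
 * randomised priorities) until asked to stop.
 */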
static int active_engine(void *data)
{
        I915_RND_STATE(prng);
        struct active_engine *arg = data;
        struct intel_engine_cs *engine = arg->engine;
        struct i915_request *rq[8] = {};
        struct intel_context *ce[ARRAY_SIZE(rq)];
        unsigned long count;
        int err = 0;

        for (count = 0; count < ARRAY_SIZE(ce); count++) {
                ce[count] = intel_context_create(engine);
                if (IS_ERR(ce[count])) {
                        err = PTR_ERR(ce[count]);
                        pr_err("[%s] Create context #%ld failed: %d!\n", engine->name, count, err);
                        while (count--)
                                intel_context_put(ce[count]);
                        return err;
                }
        }

        count = 0;
        while (!kthread_should_stop()) {
                unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
                struct i915_request *old = rq[idx];
                struct i915_request *new;

                new = intel_context_create_request(ce[idx]);
                if (IS_ERR(new)) {
                        err = PTR_ERR(new);
                        pr_err("[%s] Create request #%d failed: %d!\n", engine->name, idx, err);
                        break;
                }

                rq[idx] = i915_request_get(new);
                i915_request_add(new);

                if (engine->sched_engine->schedule && arg->flags & TEST_PRIORITY) {
                        struct i915_sched_attr attr = {
                                .priority =
                                        i915_prandom_u32_max_state(512, &prng),
                        };
                        engine->sched_engine->schedule(rq[idx], &attr);
                }

                err = active_request_put(old);
                if (err) {
                        pr_err("[%s] Request put failed: %d!\n", engine->name, err);
                        break;
                }

                cond_resched();
        }

        for (count = 0; count < ARRAY_SIZE(rq); count++) {
                int err__ = active_request_put(rq[count]);

                if (err__)
                        pr_err("[%s] Request put #%ld failed: %d!\n", engine->name, count, err__);

                /* Keep the first error */
                if (!err)
                        err = err__;

                intel_context_put(ce[count]);
        }

        return err;
}

static int __igt_reset_engines(struct intel_gt *gt,
                               const char *test_name,
                               unsigned int flags)
{
        struct i915_gpu_error *global = &gt->i915->gpu_error;
        struct intel_engine_cs *engine, *other;
        enum intel_engine_id id, tmp;
        struct hang h;
        int err = 0;

        /* Check that issuing a reset on one engine does not interfere
         * with any other engine.
         */

        if (!intel_has_reset_engine(gt))
                return 0;

        if (flags & TEST_ACTIVE) {
                err = hang_init(&h, gt);
                if (err)
                        return err;

                if (flags & TEST_PRIORITY)
                        h.ctx->sched.priority = 1024;
        }

        for_each_engine(engine, gt, id) {
                struct active_engine threads[I915_NUM_ENGINES] = {};
                unsigned long device = i915_reset_count(global);
                unsigned long count = 0, reported;
                bool using_guc = intel_engine_uses_guc(engine);
                IGT_TIMEOUT(end_time);

                if (flags & TEST_ACTIVE) {
                        if (!intel_engine_can_store_dword(engine))
                                continue;
                } else if (using_guc)
                        continue;

                if (!wait_for_idle(engine)) {
                        pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
                               engine->name, test_name);
                        err = -EIO;
                        break;
                }

                memset(threads, 0, sizeof(threads));
                for_each_engine(other, gt, tmp) {
                        struct task_struct *tsk;

                        threads[tmp].resets =
                                i915_reset_engine_count(global, other);

                        if (other == engine && !(flags & TEST_SELF))
                                continue;

                        if (other != engine && !(flags & TEST_OTHERS))
                                continue;

                        threads[tmp].engine = other;
                        threads[tmp].flags = flags;

                        tsk = kthread_run(active_engine, &threads[tmp],
                                          "igt/%s", other->name);
                        if (IS_ERR(tsk)) {
                                err = PTR_ERR(tsk);
                                pr_err("[%s] Thread spawn failed: %d!\n", engine->name, err);
                                goto unwind;
                        }

                        threads[tmp].task = tsk;
                        get_task_struct(tsk);
                }

                yield(); /* start all threads before we begin */

                st_engine_heartbeat_disable_no_pm(engine);
                set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
                do {
                        struct i915_request *rq = NULL;
                        struct intel_selftest_saved_policy saved;
                        int err2;

                        err = intel_selftest_modify_policy(engine, &saved,
                                                           SELFTEST_SCHEDULER_MODIFY_FAST_RESET);
                        if (err) {
                                pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
                                break;
                        }

                        if (flags & TEST_ACTIVE) {
                                rq = hang_create_request(&h, engine);
                                if (IS_ERR(rq)) {
                                        err = PTR_ERR(rq);
                                        pr_err("[%s] Create hang request failed: %d!\n",
                                               engine->name, err);
                                        goto restore;
                                }

                                i915_request_get(rq);
                                i915_request_add(rq);

                                if (!wait_until_running(&h, rq)) {
                                        struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

                                        pr_err("%s: Failed to start request %llx, at %x\n",
                                               __func__, rq->fence.seqno, hws_seqno(&h, rq));
                                        intel_engine_dump(engine, &p,
                                                          "%s\n", engine->name);

                                        i915_request_put(rq);
                                        err = -EIO;
                                        goto restore;
                                }
                        } else {
                                intel_engine_pm_get(engine);
                        }

                        if (!using_guc) {
                                err = intel_engine_reset(engine, NULL);
                                if (err) {
                                        pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
                                               engine->name, test_name, err);
                                        goto restore;
                                }
                        }

                        if (rq) {
                                /* Ensure the reset happens and kills the engine */
                                err = intel_selftest_wait_for_rq(rq);
                                if (err)
                                        pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n",
                                               engine->name, rq->fence.context,
                                               rq->fence.seqno, rq->context->guc_id, err);
                        }

                        count++;

                        if (rq) {
                                if (rq->fence.error != -EIO) {
                                        pr_err("i915_reset_engine(%s:%s): failed to reset request %lld:%lld [0x%04X]\n",
                                               engine->name, test_name,
                                               rq->fence.context,
                                               rq->fence.seqno, rq->context->guc_id);
                                        i915_request_put(rq);

                                        GEM_TRACE_DUMP();
                                        intel_gt_set_wedged(gt);
                                        err = -EIO;
                                        goto restore;
                                }

                                if (i915_request_wait(rq, 0, HZ / 5) < 0) {
                                        struct drm_printer p =
                                                drm_info_printer(gt->i915->drm.dev);

                                        pr_err("i915_reset_engine(%s:%s):"
                                               " failed to complete request %llx:%lld after reset\n",
                                               engine->name, test_name,
                                               rq->fence.context,
                                               rq->fence.seqno);
                                        intel_engine_dump(engine, &p,
                                                          "%s\n", engine->name);
                                        i915_request_put(rq);

                                        GEM_TRACE_DUMP();
                                        intel_gt_set_wedged(gt);
                                        err = -EIO;
                                        goto restore;
                                }

                                i915_request_put(rq);
                        }

                        if (!(flags & TEST_ACTIVE))
                                intel_engine_pm_put(engine);

                        if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
                                struct drm_printer p =
                                        drm_info_printer(gt->i915->drm.dev);

                                pr_err("i915_reset_engine(%s:%s):"
                                       " failed to idle after reset\n",
                                       engine->name, test_name);
                                intel_engine_dump(engine, &p,
                                                  "%s\n", engine->name);

                                err = -EIO;
                                goto restore;
                        }

restore:
                        err2 = intel_selftest_restore_policy(engine, &saved);
                        if (err2)
                                pr_err("[%s] Restore policy failed: %d!\n", engine->name, err2);
                        if (err == 0)
                                err = err2;
                        if (err)
                                break;
                } while (time_before(jiffies, end_time));
                clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
                st_engine_heartbeat_enable_no_pm(engine);

                pr_info("i915_reset_engine(%s:%s): %lu resets\n",
                        engine->name, test_name, count);

                /* GuC based resets are not logged per engine */
                if (!using_guc) {
                        reported = i915_reset_engine_count(global, engine);
                        reported -= threads[engine->id].resets;
                        if (reported != count) {
                                pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
                                       engine->name, test_name, count, reported);
                                if (!err)
                                        err = -EINVAL;
                        }
                }

unwind:
                for_each_engine(other, gt, tmp) {
                        int ret;

                        if (!threads[tmp].task)
                                continue;

                        ret = kthread_stop(threads[tmp].task);
                        if (ret) {
                                pr_err("kthread for other engine %s failed, err=%d\n",
                                       other->name, ret);
                                if (!err)
                                        err = ret;
                        }
                        put_task_struct(threads[tmp].task);

                        /* GuC based resets are not logged per engine */
                        if (!using_guc) {
                                if (other->uabi_class != engine->uabi_class &&
                                    threads[tmp].resets !=
                                    i915_reset_engine_count(global, other)) {
                                        pr_err("Innocent engine %s was reset (count=%ld)\n",
                                               other->name,
                                               i915_reset_engine_count(global, other) -
                                               threads[tmp].resets);
                                        if (!err)
                                                err = -EINVAL;
                                }
                        }
                }

                if (device != i915_reset_count(global)) {
                        pr_err("Global reset (count=%ld)!\n",
                               i915_reset_count(global) - device);
                        if (!err)
                                err = -EINVAL;
                }

                if (err)
                        break;

                err = igt_flush_test(gt->i915);
                if (err) {
                        pr_err("[%s] Flush failed: %d!\n", engine->name, err);
                        break;
                }
        }

        if (intel_gt_is_wedged(gt))
                err = -EIO;

        if (flags & TEST_ACTIVE)
                hang_fini(&h);

        return err;
}

static int igt_reset_engines(void *arg)
{
        static const struct {
                const char *name;
                unsigned int flags;
        } phases[] = {
                { "idle", 0 },
                { "active", TEST_ACTIVE },
                { "others-idle", TEST_OTHERS },
                { "others-active", TEST_OTHERS | TEST_ACTIVE },
                {
                        "others-priority",
                        TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
                },
                {
                        "self-priority",
                        TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
                },
                { }
        };
        struct intel_gt *gt = arg;
        typeof(*phases) *p;
        int err;

        for (p = phases; p->name; p++) {
                if (p->flags & TEST_PRIORITY) {
                        if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
                                continue;
                }

                err = __igt_reset_engines(arg, p->name, p->flags);
                if (err)
                        return err;
        }

        return 0;
}

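/*
 * Pretend hangcheck fired: perform the reset directly and return the
 * global reset count from beforehand so the caller can verify it advanced.
 */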
static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
{
        u32 count = i915_reset_count(&gt->i915->gpu_error);

        intel_gt_reset(gt, mask, NULL);

        return count;
}

static int igt_reset_wait(void *arg)
{
        struct intel_gt *gt = arg;
        struct i915_gpu_error *global = &gt->i915->gpu_error;
        struct intel_engine_cs *engine = gt->engine[RCS0];
        struct i915_request *rq;
        unsigned int reset_count;
        struct hang h;
        long timeout;
        int err;

        if (!engine || !intel_engine_can_store_dword(engine))
                return 0;

        /* Check that we detect a stuck waiter and issue a reset */

        igt_global_reset_lock(gt);

        err = hang_init(&h, gt);
        if (err) {
                pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
                goto unlock;
        }

        rq = hang_create_request(&h, engine);
        if (IS_ERR(rq)) {
                err = PTR_ERR(rq);
                pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
                goto fini;
        }

        i915_request_get(rq);
        i915_request_add(rq);

        if (!wait_until_running(&h, rq)) {
                struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

                pr_err("%s: Failed to start request %llx, at %x\n",
                       __func__, rq->fence.seqno, hws_seqno(&h, rq));
                intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

                intel_gt_set_wedged(gt);

                err = -EIO;
                goto out_rq;
        }

        reset_count = fake_hangcheck(gt, ALL_ENGINES);

        timeout = i915_request_wait(rq, 0, 10);
        if (timeout < 0) {
                pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
                       timeout);
                err = timeout;
                goto out_rq;
        }

        if (i915_reset_count(global) == reset_count) {
                pr_err("No GPU reset recorded!\n");
                err = -EINVAL;
                goto out_rq;
        }

out_rq:
        i915_request_put(rq);
fini:
        hang_fini(&h);
unlock:
        igt_global_reset_unlock(gt);

        if (intel_gt_is_wedged(gt))
                return -EIO;

        return err;
}

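/*
 * Argument block for the evict_vma/evict_fence worker threads; the
 * completion is signalled as soon as the worker starts running.
 */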
struct evict_vma {
        struct completion completion;
        struct i915_vma *vma;
};

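/*
 * Worker: try to evict the target node from its vm. The node is still
 * busy with the hanging batch, so the eviction cannot complete until the
 * hang has been resolved by a reset.
 */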
static int evict_vma(void *data)
{
        struct evict_vma *arg = data;
        struct i915_address_space *vm = arg->vma->vm;
        struct drm_mm_node evict = arg->vma->node;
        int err;

        complete(&arg->completion);

        mutex_lock(&vm->mutex);
        err = i915_gem_evict_for_node(vm, &evict, 0);
        mutex_unlock(&vm->mutex);

        return err;
}

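/*
 * Worker: instead of unbinding, cycle the vma through a Y-tiled fence
 * register, forcing an mmio fence update that likewise must not get stuck
 * behind the hanging batch.
 */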
static int evict_fence(void *data)
{
        struct evict_vma *arg = data;
        int err;

        complete(&arg->completion);

        /* Mark the fence register as dirty to force the mmio update. */
        err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
        if (err) {
                pr_err("Invalid Y-tiling settings; err:%d\n", err);
                return err;
        }

        err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
        if (err) {
                pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
                return err;
        }

        err = i915_vma_pin_fence(arg->vma);
        i915_vma_unpin(arg->vma);
        if (err) {
                pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
                return err;
        }

        i915_vma_unpin_fence(arg->vma);

        return 0;
}

static int __igt_reset_evict_vma(struct intel_gt *gt,
                                 struct i915_address_space *vm,
                                 int (*fn)(void *),
                                 unsigned int flags)
{
        struct intel_engine_cs *engine = gt->engine[RCS0];
        struct drm_i915_gem_object *obj;
        struct task_struct *tsk = NULL;
        struct i915_request *rq;
        struct evict_vma arg;
        struct hang h;
        unsigned int pin_flags;
        int err;

        if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE)
                return 0;

        if (!engine || !intel_engine_can_store_dword(engine))
                return 0;

        /* Check that we can recover an unbind stuck on a hanging request */

        err = hang_init(&h, gt);
        if (err) {
                pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
                return err;
        }

        obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
        if (IS_ERR(obj)) {
                err = PTR_ERR(obj);
                pr_err("[%s] Create object failed: %d!\n", engine->name, err);
                goto fini;
        }

        if (flags & EXEC_OBJECT_NEEDS_FENCE) {
                err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
                if (err) {
                        pr_err("Invalid X-tiling settings; err:%d\n", err);
                        goto out_obj;
                }
        }

        arg.vma = i915_vma_instance(obj, vm, NULL);
        if (IS_ERR(arg.vma)) {
                err = PTR_ERR(arg.vma);
                pr_err("[%s] VMA instance failed: %d!\n", engine->name, err);
                goto out_obj;
        }

        rq = hang_create_request(&h, engine);
        if (IS_ERR(rq)) {
                err = PTR_ERR(rq);
                pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
                goto out_obj;
        }

        pin_flags = i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER;

        if (flags & EXEC_OBJECT_NEEDS_FENCE)
                pin_flags |= PIN_MAPPABLE;

        err = i915_vma_pin(arg.vma, 0, 0, pin_flags);
        if (err) {
                i915_request_add(rq);
                pr_err("[%s] VMA pin failed: %d!\n", engine->name, err);
                goto out_obj;
        }

        if (flags & EXEC_OBJECT_NEEDS_FENCE) {
                err = i915_vma_pin_fence(arg.vma);
                if (err) {
                        pr_err("Unable to pin X-tiled fence; err:%d\n", err);
                        i915_vma_unpin(arg.vma);
                        i915_request_add(rq);
                        goto out_obj;
                }
        }

        i915_vma_lock(arg.vma);
        err = i915_request_await_object(rq, arg.vma->obj,
                                        flags & EXEC_OBJECT_WRITE);
        if (err == 0) {
                err = i915_vma_move_to_active(arg.vma, rq, flags);
                if (err)
                        pr_err("[%s] Move to active failed: %d!\n", engine->name, err);
        } else {
                pr_err("[%s] Request await failed: %d!\n", engine->name, err);
        }

        i915_vma_unlock(arg.vma);

        if (flags & EXEC_OBJECT_NEEDS_FENCE)
                i915_vma_unpin_fence(arg.vma);
        i915_vma_unpin(arg.vma);

        i915_request_get(rq);
1516        i915_request_add(rq);
1517        if (err)
1518                goto out_rq;
1519
1520        if (!wait_until_running(&h, rq)) {
1521                struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1522
1523                pr_err("%s: Failed to start request %llx, at %x\n",
1524                       __func__, rq->fence.seqno, hws_seqno(&h, rq));
1525                intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1526
1527                intel_gt_set_wedged(gt);
1528                goto out_reset;
1529        }
1530
1531        init_completion(&arg.completion);
1532
1533        tsk = kthread_run(fn, &arg, "igt/evict_vma");
1534        if (IS_ERR(tsk)) {
1535                err = PTR_ERR(tsk);
1536                pr_err("[%s] Thread spawn failed: %d!\n", engine->name, err);
1537                tsk = NULL;
1538                goto out_reset;
1539        }
1540        get_task_struct(tsk);
1541
1542        wait_for_completion(&arg.completion);
1543
1544        if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
1545                struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1546
1547                pr_err("igt/evict_vma kthread did not wait\n");
1548                intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1549
1550                intel_gt_set_wedged(gt);
1551                goto out_reset;
1552        }
1553
1554out_reset:
1555        igt_global_reset_lock(gt);
1556        fake_hangcheck(gt, rq->engine->mask);
1557        igt_global_reset_unlock(gt);
1558
1559        if (tsk) {
1560                struct intel_wedge_me w;
1561
1562                /* The reset, even indirectly, should complete well inside this 100ms grace. */
1563                intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
1564                        err = kthread_stop(tsk);
1565
1566                put_task_struct(tsk);
1567        }
1568
1569out_rq:
1570        i915_request_put(rq);
1571out_obj:
1572        i915_gem_object_put(obj);
1573fini:
1574        hang_fini(&h);
1575        if (intel_gt_is_wedged(gt))
1576                return -EIO;
1577
1578        return err;
1579}
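/*
 * Editorial note: the wait on rq->fence.cb_list above works because the
 * evictor ends up in a dma-fence wait on the hanging request, and such a
 * wait installs a callback on the fence; a non-empty cb_list is thus a
 * cheap proxy for "the kthread is genuinely blocked on the hang" before
 * the reset is injected to unblock it.
 */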
1580
1581static int igt_reset_evict_ggtt(void *arg)
1582{
1583        struct intel_gt *gt = arg;
1584
1585        return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
1586                                     evict_vma, EXEC_OBJECT_WRITE);
1587}
1588
1589static int igt_reset_evict_ppgtt(void *arg)
1590{
1591        struct intel_gt *gt = arg;
1592        struct i915_ppgtt *ppgtt;
1593        int err;
1594
1595        /* aliasing ppgtt shares the global GTT locking, covered by the ggtt test above */
1596        if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL)
1597                return 0;
1598
1599        ppgtt = i915_ppgtt_create(gt);
1600        if (IS_ERR(ppgtt))
1601                return PTR_ERR(ppgtt);
1602
1603        err = __igt_reset_evict_vma(gt, &ppgtt->vm,
1604                                    evict_vma, EXEC_OBJECT_WRITE);
1605        i915_vm_put(&ppgtt->vm);
1606
1607        return err;
1608}
1609
1610static int igt_reset_evict_fence(void *arg)
1611{
1612        struct intel_gt *gt = arg;
1613
1614        return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
1615                                     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
1616}
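/*
 * Editorial note: the three wrappers above parameterise
 * __igt_reset_evict_vma() over the address space and the blocking
 * operation -- a GGTT unbind, a full-ppgtt unbind, and a fence-register
 * update respectively -- so that each path an unbind may block on is
 * exercised against a hanging request.
 */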
1617
1618static int wait_for_others(struct intel_gt *gt,
1619                           struct intel_engine_cs *exclude)
1620{
1621        struct intel_engine_cs *engine;
1622        enum intel_engine_id id;
1623
1624        for_each_engine(engine, gt, id) {
1625                if (engine == exclude)
1626                        continue;
1627
1628                if (!wait_for_idle(engine))
1629                        return -EIO;
1630        }
1631
1632        return 0;
1633}
1634
1635static int igt_reset_queue(void *arg)
1636{
1637        struct intel_gt *gt = arg;
1638        struct i915_gpu_error *global = &gt->i915->gpu_error;
1639        struct intel_engine_cs *engine;
1640        enum intel_engine_id id;
1641        struct hang h;
1642        int err;
1643
1644        /* Check that we replay pending requests following a hang */
1645
1646        igt_global_reset_lock(gt);
1647
1648        err = hang_init(&h, gt);
1649        if (err)
1650                goto unlock;
1651
1652        for_each_engine(engine, gt, id) {
1653                struct intel_selftest_saved_policy saved;
1654                struct i915_request *prev;
1655                IGT_TIMEOUT(end_time);
1656                unsigned int count;
1657                bool using_guc = intel_engine_uses_guc(engine);
1658
1659                if (!intel_engine_can_store_dword(engine))
1660                        continue;
1661
1662                if (using_guc) {
1663                        err = intel_selftest_modify_policy(engine, &saved,
1664                                                           SELFTEST_SCHEDULER_MODIFY_NO_HANGCHECK);
1665                        if (err) {
1666                                pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
1667                                goto fini;
1668                        }
1669                }
1670
1671                prev = hang_create_request(&h, engine);
1672                if (IS_ERR(prev)) {
1673                        err = PTR_ERR(prev);
1674                        pr_err("[%s] Create 'prev' hang request failed: %d!\n", engine->name, err);
1675                        goto restore;
1676                }
1677
1678                i915_request_get(prev);
1679                i915_request_add(prev);
1680
1681                count = 0;
1682                do {
1683                        struct i915_request *rq;
1684                        unsigned int reset_count;
1685
1686                        rq = hang_create_request(&h, engine);
1687                        if (IS_ERR(rq)) {
1688                                err = PTR_ERR(rq);
1689                                pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
1690                                goto restore;
1691                        }
1692
1693                        i915_request_get(rq);
1694                        i915_request_add(rq);
1695
1696                        /*
1697                         * XXX We don't handle resetting the kernel context
1698                         * very well. If we trigger a device reset twice in
1699                         * quick succession while the kernel context is
1700                         * executing, we may end up skipping the breadcrumb.
1701                         * This is really only a problem for the selftest as
1702                         * normally there is a large interlude between resets
1703                         * (hangcheck), or we focus on resetting just one
1704                         * engine and so avoid repeatedly resetting innocents.
1705                         */
1706                        err = wait_for_others(gt, engine);
1707                        if (err) {
1708                                pr_err("%s(%s): Failed to idle other engines after device reset\n",
1709                                       __func__, engine->name);
1710                                i915_request_put(rq);
1711                                i915_request_put(prev);
1712
1713                                GEM_TRACE_DUMP();
1714                                intel_gt_set_wedged(gt);
1715                                goto restore;
1716                        }
1717
1718                        if (!wait_until_running(&h, prev)) {
1719                                struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1720
1721                                pr_err("%s(%s): Failed to start request %llx, at %x\n",
1722                                       __func__, engine->name,
1723                                       prev->fence.seqno, hws_seqno(&h, prev));
1724                                intel_engine_dump(engine, &p,
1725                                                  "%s\n", engine->name);
1726
1727                                i915_request_put(rq);
1728                                i915_request_put(prev);
1729
1730                                intel_gt_set_wedged(gt);
1731
1732                                err = -EIO;
1733                                goto restore;
1734                        }
1735
1736                        reset_count = fake_hangcheck(gt, BIT(id));
1737
1738                        if (prev->fence.error != -EIO) {
1739                                pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
1740                                       prev->fence.error);
1741                                i915_request_put(rq);
1742                                i915_request_put(prev);
1743                                err = -EINVAL;
1744                                goto restore;
1745                        }
1746
1747                        if (rq->fence.error) {
1748                                pr_err("Fence error status not zero [%d] after unrelated reset\n",
1749                                       rq->fence.error);
1750                                i915_request_put(rq);
1751                                i915_request_put(prev);
1752                                err = -EINVAL;
1753                                goto restore;
1754                        }
1755
1756                        if (i915_reset_count(global) == reset_count) {
1757                                pr_err("No GPU reset recorded!\n");
1758                                i915_request_put(rq);
1759                                i915_request_put(prev);
1760                                err = -EINVAL;
1761                                goto restore;
1762                        }
1763
1764                        i915_request_put(prev);
1765                        prev = rq;
1766                        count++;
1767                } while (time_before(jiffies, end_time));
1768                pr_info("%s: Completed %d queued resets\n",
1769                        engine->name, count);
1770
1771                *h.batch = MI_BATCH_BUFFER_END;
1772                intel_gt_chipset_flush(engine->gt);
1773
1774                i915_request_put(prev);
1775
1776restore:
1777                if (using_guc) {
1778                        int err2 = intel_selftest_restore_policy(engine, &saved);
1779
1780                        if (err2)
1781                                pr_err("%s:%d> [%s] Restore policy failed: %d!\n",
1782                                       __func__, __LINE__, engine->name, err2);
1783                        if (err == 0)
1784                                err = err2;
1785                }
1786                if (err)
1787                        goto fini;
1788
1789                err = igt_flush_test(gt->i915);
1790                if (err) {
1791                        pr_err("[%s] Flush failed: %d!\n", engine->name, err);
1792                        break;
1793                }
1794        }
1795
1796fini:
1797        hang_fini(&h);
1798unlock:
1799        igt_global_reset_unlock(gt);
1800
1801        if (intel_gt_is_wedged(gt))
1802                return -EIO;
1803
1804        return err;
1805}
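/*
 * Editorial sketch (hypothetical helper) of the per-iteration verdict in
 * igt_reset_queue(): after resetting only the hung engine, the guilty
 * request must be flagged with -EIO while the innocent request queued
 * behind it must remain error-free so that it can be replayed.
 */
static int __maybe_unused check_reset_verdict(const struct i915_request *guilty,
                                              const struct i915_request *innocent)
{
        if (guilty->fence.error != -EIO)
                return -EINVAL; /* the hang was not blamed on its culprit */

        if (innocent->fence.error)
                return -EINVAL; /* collateral damage to an unrelated request */

        return 0;
}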
1806
1807static int igt_handle_error(void *arg)
1808{
1809        struct intel_gt *gt = arg;
1810        struct i915_gpu_error *global = &gt->i915->gpu_error;
1811        struct intel_engine_cs *engine = gt->engine[RCS0];
1812        struct hang h;
1813        struct i915_request *rq;
1814        struct i915_gpu_coredump *error;
1815        int err;
1816
1817        /* Check that we can issue a global GPU and engine reset */
1818
1819        if (!intel_has_reset_engine(gt))
1820                return 0;
1821
1822        if (!engine || !intel_engine_can_store_dword(engine))
1823                return 0;
1824
1825        err = hang_init(&h, gt);
1826        if (err) {
1827                pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
1828                return err;
1829        }
1830
1831        rq = hang_create_request(&h, engine);
1832        if (IS_ERR(rq)) {
1833                err = PTR_ERR(rq);
1834                pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
1835                goto err_fini;
1836        }
1837
1838        i915_request_get(rq);
1839        i915_request_add(rq);
1840
1841        if (!wait_until_running(&h, rq)) {
1842                struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1843
1844                pr_err("%s: Failed to start request %llx, at %x\n",
1845                       __func__, rq->fence.seqno, hws_seqno(&h, rq));
1846                intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1847
1848                intel_gt_set_wedged(gt);
1849
1850                err = -EIO;
1851                goto err_request;
1852        }
1853
1854        /* Temporarily disable error capture */
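        /*
         * (Editorial note: swapping in a non-NULL sentinel makes the
         * capture code believe a "first error" has already been recorded,
         * so no new coredump is stored while the error is handled; the
         * second xchg below puts the original pointer back.)
         */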
1855        error = xchg(&global->first_error, (void *)-1);
1856
1857        intel_gt_handle_error(gt, engine->mask, 0, NULL);
1858
1859        xchg(&global->first_error, error);
1860
1861        if (rq->fence.error != -EIO) {
1862                pr_err("Guilty request not identified!\n");
1863                err = -EINVAL;
1864                goto err_request;
1865        }
1866
1867err_request:
1868        i915_request_put(rq);
1869err_fini:
1870        hang_fini(&h);
1871        return err;
1872}
1873
1874static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
1875                                     const struct igt_atomic_section *p,
1876                                     const char *mode)
1877{
1878        struct tasklet_struct * const t = &engine->sched_engine->tasklet;
1879        int err;
1880
1881        GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
1882                  engine->name, mode, p->name);
1883
1884        if (t->func)
1885                tasklet_disable(t);
1886        if (strcmp(p->name, "softirq"))
1887                local_bh_disable();
1888        p->critical_section_begin();
1889
1890        err = __intel_engine_reset_bh(engine, NULL);
1891
1892        p->critical_section_end();
1893        if (strcmp(p->name, "softirq"))
1894                local_bh_enable();
1895        if (t->func) {
1896                tasklet_enable(t);
1897                tasklet_hi_schedule(t);
1898        }
1899
1900        if (err)
1901                pr_err("i915_reset_engine(%s:%s) failed under %s\n",
1902                       engine->name, mode, p->name);
1903
1904        return err;
1905}
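/*
 * Editorial note: the bracket above simulates calling the reset from
 * atomic context -- the submission tasklet is disabled so it cannot race
 * with the reset, bottom halves are disabled unless the phase under test
 * is itself "softirq", and the igt_atomic_section callbacks then enter
 * the real critical section (irq, preempt, ...) around
 * __intel_engine_reset_bh().
 */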
1906
1907static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
1908                                   const struct igt_atomic_section *p)
1909{
1910        struct i915_request *rq;
1911        struct hang h;
1912        int err;
1913
1914        err = __igt_atomic_reset_engine(engine, p, "idle");
1915        if (err)
1916                return err;
1917
1918        err = hang_init(&h, engine->gt);
1919        if (err) {
1920                pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
1921                return err;
1922        }
1923
1924        rq = hang_create_request(&h, engine);
1925        if (IS_ERR(rq)) {
1926                err = PTR_ERR(rq);
1927                pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
1928                goto out;
1929        }
1930
1931        i915_request_get(rq);
1932        i915_request_add(rq);
1933
1934        if (wait_until_running(&h, rq)) {
1935                err = __igt_atomic_reset_engine(engine, p, "active");
1936        } else {
1937                pr_err("%s(%s): Failed to start request %llx, at %x\n",
1938                       __func__, engine->name,
1939                       rq->fence.seqno, hws_seqno(&h, rq));
1940                intel_gt_set_wedged(engine->gt);
1941                err = -EIO;
1942        }
1943
1944        if (err == 0) {
1945                struct intel_wedge_me w;
1946
1947                intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
1948                        i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
1949                if (intel_gt_is_wedged(engine->gt))
1950                        err = -EIO;
1951        }
1952
1953        i915_request_put(rq);
1954out:
1955        hang_fini(&h);
1956        return err;
1957}
1958
1959static int igt_reset_engines_atomic(void *arg)
1960{
1961        struct intel_gt *gt = arg;
1962        const typeof(*igt_atomic_phases) *p;
1963        int err = 0;
1964
1965        /* Check that the engines resets are usable from atomic context */
1966
1967        if (!intel_has_reset_engine(gt))
1968                return 0;
1969
1970        if (intel_uc_uses_guc_submission(&gt->uc))
1971                return 0;
1972
1973        igt_global_reset_lock(gt);
1974
1975        /* Flush any requests before we get started and check basics */
1976        if (!igt_force_reset(gt))
1977                goto unlock;
1978
1979        for (p = igt_atomic_phases; p->name; p++) {
1980                struct intel_engine_cs *engine;
1981                enum intel_engine_id id;
1982
1983                for_each_engine(engine, gt, id) {
1984                        err = igt_atomic_reset_engine(engine, p);
1985                        if (err)
1986                                goto out;
1987                }
1988        }
1989
1990out:
1991        /* As we poke around the guts, do a full reset before continuing. */
1992        igt_force_reset(gt);
1993unlock:
1994        igt_global_reset_unlock(gt);
1995
1996        return err;
1997}
1998
1999int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
2000{
2001        static const struct i915_subtest tests[] = {
2002                SUBTEST(igt_hang_sanitycheck),
2003                SUBTEST(igt_reset_nop),
2004                SUBTEST(igt_reset_nop_engine),
2005                SUBTEST(igt_reset_idle_engine),
2006                SUBTEST(igt_reset_active_engine),
2007                SUBTEST(igt_reset_fail_engine),
2008                SUBTEST(igt_reset_engines),
2009                SUBTEST(igt_reset_engines_atomic),
2010                SUBTEST(igt_reset_queue),
2011                SUBTEST(igt_reset_wait),
2012                SUBTEST(igt_reset_evict_ggtt),
2013                SUBTEST(igt_reset_evict_ppgtt),
2014                SUBTEST(igt_reset_evict_fence),
2015                SUBTEST(igt_handle_error),
2016        };
2017        struct intel_gt *gt = &i915->gt;
2018        intel_wakeref_t wakeref;
2019        int err;
2020
2021        if (!intel_has_gpu_reset(gt))
2022                return 0;
2023
2024        if (intel_gt_is_wedged(gt))
2025                return -EIO; /* we're long past hope of a successful reset */
2026
2027        wakeref = intel_runtime_pm_get(gt->uncore->rpm);
2028
2029        err = intel_gt_live_subtests(tests, gt);
2030
2031        intel_runtime_pm_put(gt->uncore->rpm, wakeref);
2032
2033        return err;
2034}
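/*
 * Editorial note: with CONFIG_DRM_I915_SELFTEST=y this table is run at
 * module load when selected via the i915.live_selftests module parameter
 * (IGT exposes the same entry point as igt@i915_selftest@live@hangcheck).
 * The runtime-pm wakeref taken above keeps the device awake for the whole
 * run, since every subtest expects to be able to touch the GPU.
 */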
2035